# Author: Leland McInnes <leland.mcinnes@gmail.com>
#
# License: BSD 3 clause
import numba
import numpy as np
import scipy.stats
from sklearn.metrics import pairwise_distances

_mock_identity = np.eye(2, dtype=np.float64)
_mock_cost = 1.0 - _mock_identity
_mock_ones = np.ones(2, dtype=np.float64)


@numba.njit()
def sign(a):
    if a < 0:
        return -1
    else:
        return 1


@numba.njit(fastmath=True)
def euclidean(x, y):
    r"""Standard euclidean distance.

    ..math::
        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += (x[i] - y[i]) ** 2
    return np.sqrt(result)


@numba.njit(fastmath=True)
def euclidean_grad(x, y):
    r"""Standard euclidean distance and its gradient.

    ..math::
        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}
        \frac{dD(x, y)}{dx} = (x_i - y_i)/D(x,y)
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += (x[i] - y[i]) ** 2
    d = np.sqrt(result)
    grad = (x - y) / (1e-6 + d)
    return d, grad


@numba.njit()
def standardised_euclidean(x, y, sigma=_mock_ones):
    r"""Euclidean distance standardised against a vector of standard
    deviations per coordinate.

    ..math::
        D(x, y) = \sqrt{\sum_i \frac{(x_i - y_i)**2}{v_i}}
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += ((x[i] - y[i]) ** 2) / sigma[i]

    return np.sqrt(result)


@numba.njit(fastmath=True)
def standardised_euclidean_grad(x, y, sigma=_mock_ones):
    r"""Euclidean distance standardised against a vector of standard
    deviations per coordinate with gradient.

    ..math::
        D(x, y) = \sqrt{\sum_i \frac{(x_i - y_i)**2}{v_i}}
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += (x[i] - y[i]) ** 2 / sigma[i]
    d = np.sqrt(result)
    grad = (x - y) / (1e-6 + d * sigma)
    return d, grad


@numba.njit()
def manhattan(x, y):
    r"""Manhattan, taxicab, or l1 distance.

    ..math::
        D(x, y) = \sum_i |x_i - y_i|
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += np.abs(x[i] - y[i])

    return result


@numba.njit()
def manhattan_grad(x, y):
    r"""Manhattan, taxicab, or l1 distance with gradient.

    ..math::
        D(x, y) = \sum_i |x_i - y_i|
    """
    result = 0.0
    grad = np.zeros(x.shape)
    for i in range(x.shape[0]):
        result += np.abs(x[i] - y[i])
        grad[i] = np.sign(x[i] - y[i])
    return result, grad


@numba.njit()
def chebyshev(x, y):
    r"""Chebyshev or l-infinity distance.

    ..math::
        D(x, y) = \max_i |x_i - y_i|
    """
    result = 0.0
    for i in range(x.shape[0]):
        result = max(result, np.abs(x[i] - y[i]))

    return result


@numba.njit()
def chebyshev_grad(x, y):
    r"""Chebyshev or l-infinity distance with gradient.

    ..math::
        D(x, y) = \max_i |x_i - y_i|
    """
    result = 0.0
    max_i = 0
    for i in range(x.shape[0]):
        v = np.abs(x[i] - y[i])
        if v > result:
            result = v
            max_i = i
    grad = np.zeros(x.shape)
    grad[max_i] = np.sign(x[max_i] - y[max_i])

    return result, grad


@numba.njit()
def minkowski(x, y, p=2):
    r"""Minkowski distance.

    ..math::
        D(x, y) = \left(\sum_i |x_i - y_i|^p\right)^{\frac{1}{p}}

    This is a general distance. For p=1 it is equivalent to
    manhattan distance, for p=2 it is Euclidean distance, and
    for p=infinity it is Chebyshev distance. In general it is better
    to use the more specialised functions for those distances.
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += (np.abs(x[i] - y[i])) ** p

    return result ** (1.0 / p)


@numba.njit()
def minkowski_grad(x, y, p=2):
    r"""Minkowski distance with gradient.

    ..math::
        D(x, y) = \left(\sum_i |x_i - y_i|^p\right)^{\frac{1}{p}}

    This is a general distance. For p=1 it is equivalent to
    manhattan distance, for p=2 it is Euclidean distance, and
    for p=infinity it is Chebyshev distance. In general it is better
    to use the more specialised functions for those distances.
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += (np.abs(x[i] - y[i])) ** p

    grad = np.empty(x.shape[0], dtype=np.float32)
    for i in range(x.shape[0]):
        grad[i] = (
            pow(np.abs(x[i] - y[i]), (p - 1.0))
            * sign(x[i] - y[i])
            * pow(result, (1.0 / (p - 1)))
        )

    return result ** (1.0 / p), grad


@numba.njit()
def poincare(u, v):
    r"""Poincare distance.

    ..math::
        \delta (u, v) = 2 \frac{ \lVert  u - v \rVert ^2 }{ ( 1 - \lVert  u \rVert ^2 ) ( 1 - \lVert  v \rVert ^2 ) }
        D(x, y) = \operatorname{arcosh} (1+\delta (u,v))
    """
    sq_u_norm = np.sum(u * u)
    sq_v_norm = np.sum(v * v)
    sq_dist = np.sum(np.power(u - v, 2))
    return np.arccosh(1 + 2 * (sq_dist / ((1 - sq_u_norm) * (1 - sq_v_norm))))


@numba.njit()
def hyperboloid_grad(x, y):
    s = np.sqrt(1 + np.sum(x ** 2))
    t = np.sqrt(1 + np.sum(y ** 2))

    B = s * t
    for i in range(x.shape[0]):
        B -= x[i] * y[i]

    if B <= 1:
        B = 1.0 + 1e-8

    grad_coeff = 1.0 / (np.sqrt(B - 1) * np.sqrt(B + 1))

    # return np.arccosh(B), np.zeros(x.shape[0])

    grad = np.zeros(x.shape[0])
    for i in range(x.shape[0]):
        grad[i] = grad_coeff * (((x[i] * t) / s) - y[i])

    return np.arccosh(B), grad


@numba.njit()
def weighted_minkowski(x, y, w=_mock_ones, p=2):
    r"""A weighted version of Minkowski distance.

    ..math::
        D(x, y) = \left(\sum_i w_i |x_i - y_i|^p\right)^{\frac{1}{p}}

    If weights w_i are inverse standard deviations of data in each dimension
    then this represented a standardised Minkowski distance (and is
    equivalent to standardised Euclidean distance for p=1).
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += w[i] * np.abs(x[i] - y[i]) ** p

    return result ** (1.0 / p)


@numba.njit()
def weighted_minkowski_grad(x, y, w=_mock_ones, p=2):
    r"""A weighted version of Minkowski distance with gradient.

    ..math::
        D(x, y) = \left(\sum_i w_i |x_i - y_i|^p\right)^{\frac{1}{p}}

    If weights w_i are inverse standard deviations of data in each dimension
    then this represented a standardised Minkowski distance (and is
    equivalent to standardised Euclidean distance for p=1).
    """
    result = 0.0
    for i in range(x.shape[0]):
        result += w[i] * (np.abs(x[i] - y[i])) ** p

    grad = np.empty(x.shape[0], dtype=np.float32)
    for i in range(x.shape[0]):
        grad[i] = (
            w[i]
            * pow(np.abs(x[i] - y[i]), (p - 1.0))
            * sign(x[i] - y[i])
            * pow(result, (1.0 / (p - 1)))
        )

    return result ** (1.0 / p), grad


@numba.njit()
def mahalanobis(x, y, vinv=_mock_identity):
    result = 0.0

    diff = np.empty(x.shape[0], dtype=np.float32)

    for i in range(x.shape[0]):
        diff[i] = x[i] - y[i]

    for i in range(x.shape[0]):
        tmp = 0.0
        for j in range(x.shape[0]):
            tmp += vinv[i, j] * diff[j]
        result += tmp * diff[i]

    return np.sqrt(result)


@numba.njit()
def mahalanobis_grad(x, y, vinv=_mock_identity):
    result = 0.0

    diff = np.empty(x.shape[0], dtype=np.float32)

    for i in range(x.shape[0]):
        diff[i] = x[i] - y[i]

    grad_tmp = np.zeros(x.shape)
    for i in range(x.shape[0]):
        tmp = 0.0
        for j in range(x.shape[0]):
            tmp += vinv[i, j] * diff[j]
            grad_tmp[i] += vinv[i, j] * diff[j]
        result += tmp * diff[i]
    dist = np.sqrt(result)
    grad = grad_tmp / (1e-6 + dist)
    return dist, grad


@numba.njit()
def hamming(x, y):
    result = 0.0
    for i in range(x.shape[0]):
        if x[i] != y[i]:
            result += 1.0

    return float(result) / x.shape[0]


@numba.njit()
def canberra(x, y):
    result = 0.0
    for i in range(x.shape[0]):
        denominator = np.abs(x[i]) + np.abs(y[i])
        if denominator > 0:
            result += np.abs(x[i] - y[i]) / denominator

    return result


@numba.njit()
def canberra_grad(x, y):
    result = 0.0
    grad = np.zeros(x.shape)
    for i in range(x.shape[0]):
        denominator = np.abs(x[i]) + np.abs(y[i])
        if denominator > 0:
            result += np.abs(x[i] - y[i]) / denominator
            grad[i] = (
                np.sign(x[i] - y[i]) / denominator
                - np.abs(x[i] - y[i]) * np.sign(x[i]) / denominator ** 2
            )

    return result, grad


@numba.njit()
def bray_curtis(x, y):
    numerator = 0.0
    denominator = 0.0
    for i in range(x.shape[0]):
        numerator += np.abs(x[i] - y[i])
        denominator += np.abs(x[i] + y[i])

    if denominator > 0.0:
        return float(numerator) / denominator
    else:
        return 0.0


@numba.njit()
def bray_curtis_grad(x, y):
    numerator = 0.0
    denominator = 0.0
    for i in range(x.shape[0]):
        numerator += np.abs(x[i] - y[i])
        denominator += np.abs(x[i] + y[i])

    if denominator > 0.0:
        dist = float(numerator) / denominator
        grad = (np.sign(x - y) - dist) / denominator
    else:
        dist = 0.0
        grad = np.zeros(x.shape)

    return dist, grad


@numba.njit()
def jaccard(x, y):
    num_non_zero = 0.0
    num_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_non_zero += x_true or y_true
        num_equal += x_true and y_true

    if num_non_zero == 0.0:
        return 0.0
    else:
        return float(num_non_zero - num_equal) / num_non_zero


@numba.njit()
def matching(x, y):
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_not_equal += x_true != y_true

    return float(num_not_equal) / x.shape[0]


@numba.njit()
def dice(x, y):
    num_true_true = 0.0
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_true_true += x_true and y_true
        num_not_equal += x_true != y_true

    if num_not_equal == 0.0:
        return 0.0
    else:
        return num_not_equal / (2.0 * num_true_true + num_not_equal)


@numba.njit()
def kulsinski(x, y):
    num_true_true = 0.0
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_true_true += x_true and y_true
        num_not_equal += x_true != y_true

    if num_not_equal == 0:
        return 0.0
    else:
        return float(num_not_equal - num_true_true + x.shape[0]) / (
            num_not_equal + x.shape[0]
        )


@numba.njit()
def rogers_tanimoto(x, y):
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_not_equal += x_true != y_true

    return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal)


@numba.njit()
def russellrao(x, y):
    num_true_true = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_true_true += x_true and y_true

    if num_true_true == np.sum(x != 0) and num_true_true == np.sum(y != 0):
        return 0.0
    else:
        return float(x.shape[0] - num_true_true) / (x.shape[0])


@numba.njit()
def sokal_michener(x, y):
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_not_equal += x_true != y_true

    return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal)


@numba.njit()
def sokal_sneath(x, y):
    num_true_true = 0.0
    num_not_equal = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_true_true += x_true and y_true
        num_not_equal += x_true != y_true

    if num_not_equal == 0.0:
        return 0.0
    else:
        return num_not_equal / (0.5 * num_true_true + num_not_equal)


@numba.njit()
def haversine(x, y):
    if x.shape[0] != 2:
        raise ValueError("haversine is only defined for 2 dimensional data")
    sin_lat = np.sin(0.5 * (x[0] - y[0]))
    sin_long = np.sin(0.5 * (x[1] - y[1]))
    result = np.sqrt(sin_lat ** 2 + np.cos(x[0]) * np.cos(y[0]) * sin_long ** 2)
    return 2.0 * np.arcsin(result)


@numba.njit()
def haversine_grad(x, y):
    # spectral initialization puts many points near the poles
    # currently, adding pi/2 to the latitude avoids problems
    # TODO: reimplement with quaternions to avoid singularity

    if x.shape[0] != 2:
        raise ValueError("haversine is only defined for 2 dimensional data")
    sin_lat = np.sin(0.5 * (x[0] - y[0]))
    cos_lat = np.cos(0.5 * (x[0] - y[0]))
    sin_long = np.sin(0.5 * (x[1] - y[1]))
    cos_long = np.cos(0.5 * (x[1] - y[1]))

    a_0 = np.cos(x[0] + np.pi / 2) * np.cos(y[0] + np.pi / 2) * sin_long ** 2
    a_1 = a_0 + sin_lat ** 2

    d = 2.0 * np.arcsin(np.sqrt(min(max(abs(a_1), 0), 1)))
    denom = np.sqrt(abs(a_1 - 1)) * np.sqrt(abs(a_1))
    grad = (
        np.array(
            [
                (
                    sin_lat * cos_lat
                    - np.sin(x[0] + np.pi / 2)
                    * np.cos(y[0] + np.pi / 2)
                    * sin_long ** 2
                ),
                (
                    np.cos(x[0] + np.pi / 2)
                    * np.cos(y[0] + np.pi / 2)
                    * sin_long
                    * cos_long
                ),
            ]
        )
        / (denom + 1e-6)
    )
    return d, grad


@numba.njit()
def yule(x, y):
    num_true_true = 0.0
    num_true_false = 0.0
    num_false_true = 0.0
    for i in range(x.shape[0]):
        x_true = x[i] != 0
        y_true = y[i] != 0
        num_true_true += x_true and y_true
        num_true_false += x_true and (not y_true)
        num_false_true += (not x_true) and y_true

    num_false_false = x.shape[0] - num_true_true - num_true_false - num_false_true

    if num_true_false == 0.0 or num_false_true == 0.0:
        return 0.0
    else:
        return (2.0 * num_true_false * num_false_true) / (
            num_true_true * num_false_false + num_true_false * num_false_true
        )


@numba.njit()
def cosine(x, y):
    result = 0.0
    norm_x = 0.0
    norm_y = 0.0
    for i in range(x.shape[0]):
        result += x[i] * y[i]
        norm_x += x[i] ** 2
        norm_y += y[i] ** 2

    if norm_x == 0.0 and norm_y == 0.0:
        return 0.0
    elif norm_x == 0.0 or norm_y == 0.0:
        return 1.0
    else:
        return 1.0 - (result / np.sqrt(norm_x * norm_y))


@numba.njit(fastmath=True)
def cosine_grad(x, y):
    result = 0.0
    norm_x = 0.0
    norm_y = 0.0
    for i in range(x.shape[0]):
        result += x[i] * y[i]
        norm_x += x[i] ** 2
        norm_y += y[i] ** 2

    if norm_x == 0.0 and norm_y == 0.0:
        dist = 0.0
        grad = np.zeros(x.shape)
    elif norm_x == 0.0 or norm_y == 0.0:
        dist = 1.0
        grad = np.zeros(x.shape)
    else:
        grad = -(x * result - y * norm_x) / np.sqrt(norm_x ** 3 * norm_y)
        dist = 1.0 - (result / np.sqrt(norm_x * norm_y))

    return dist, grad


@numba.njit()
def correlation(x, y):
    mu_x = 0.0
    mu_y = 0.0
    norm_x = 0.0
    norm_y = 0.0
    dot_product = 0.0

    for i in range(x.shape[0]):
        mu_x += x[i]
        mu_y += y[i]

    mu_x /= x.shape[0]
    mu_y /= x.shape[0]

    for i in range(x.shape[0]):
        shifted_x = x[i] - mu_x
        shifted_y = y[i] - mu_y
        norm_x += shifted_x ** 2
        norm_y += shifted_y ** 2
        dot_product += shifted_x * shifted_y

    if norm_x == 0.0 and norm_y == 0.0:
        return 0.0
    elif dot_product == 0.0:
        return 1.0
    else:
        return 1.0 - (dot_product / np.sqrt(norm_x * norm_y))


@numba.njit()
def hellinger(x, y):
    result = 0.0
    l1_norm_x = 0.0
    l1_norm_y = 0.0

    for i in range(x.shape[0]):
        result += np.sqrt(x[i] * y[i])
        l1_norm_x += x[i]
        l1_norm_y += y[i]

    if l1_norm_x == 0 and l1_norm_y == 0:
        return 0.0
    elif l1_norm_x == 0 or l1_norm_y == 0:
        return 1.0
    else:
        return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y))


@numba.njit()
def hellinger_grad(x, y):
    result = 0.0
    l1_norm_x = 0.0
    l1_norm_y = 0.0

    grad_term = np.empty(x.shape[0])

    for i in range(x.shape[0]):
        grad_term[i] = np.sqrt(x[i] * y[i])
        result += grad_term[i]
        l1_norm_x += x[i]
        l1_norm_y += y[i]

    if l1_norm_x == 0 and l1_norm_y == 0:
        dist = 0.0
        grad = np.zeros(x.shape)
    elif l1_norm_x == 0 or l1_norm_y == 0:
        dist = 1.0
        grad = np.zeros(x.shape)
    else:
        dist_denom = np.sqrt(l1_norm_x * l1_norm_y)
        dist = np.sqrt(1 - result / dist_denom)
        grad_denom = 2 * dist
        grad_numer_const = (l1_norm_y * result) / (2 * dist_denom ** 3)

        grad = (grad_numer_const - (y / grad_term * dist_denom)) / grad_denom

    return dist, grad


@numba.njit()
def approx_log_Gamma(x):
    if x == 1:
        return 0
    # x2= 1/(x*x);
    return x * np.log(x) - x + 0.5 * np.log(2.0 * np.pi / x) + 1.0 / (x * 12.0)
    # + x2*(-1.0/360.0) + x2* (1.0/1260.0 + x2*(-1.0/(1680.0)  +\
    #  x2*(1.0/1188.0 + x2*(-691.0/360360.0 + x2*(1.0/156.0 +\
    #  x2*(-3617.0/122400.0 + x2*(43687.0/244188.0 + x2*(-174611.0/125400.0) +\
    #  x2*(77683.0/5796.0 + x2*(-236364091.0/1506960.0 + x2*(657931.0/300.0))))))))))))


@numba.njit()
def log_beta(x, y):
    a = min(x, y)
    b = max(x, y)
    if b < 5:
        value = -np.log(b)
        for i in range(1, int(a)):
            value += np.log(i) - np.log(b + i)
        return value
    else:
        return approx_log_Gamma(x) + approx_log_Gamma(y) - approx_log_Gamma(x + y)


@numba.njit()
def log_single_beta(x):
    return np.log(2.0) * (-2.0 * x + 0.5) + 0.5 * np.log(2.0 * np.pi / x) + 0.125 / x


# + x2*(-1.0/192.0 + x2* (1.0/640.0 + x2*(-17.0/(14336.0) +\
#  x2*(31.0/18432.0 + x2*(-691.0/180224.0 +\
#  x2*(5461.0/425984.0 + x2*(-929569.0/15728640.0 +\
#  x2*(3189151.0/8912896.0 + x2*(-221930581.0/79691776.0) +\
#  x2*(4722116521.0/176160768.0 + x2*(-968383680827.0/3087007744.0 +\
#  x2*(14717667114151.0/3355443200.0 ))))))))))))


@numba.njit()
def ll_dirichlet(data1, data2):
    """The symmetric relative log likelihood of rolling data2 vs data1
    in n trials on a die that rolled data1 in sum(data1) trials.

    ..math::
        D(data1, data2) = DirichletMultinomail(data2 | data1)
    """

    n1 = np.sum(data1)
    n2 = np.sum(data2)

    log_b = 0.0
    self_denom1 = 0.0
    self_denom2 = 0.0

    for i in range(data1.shape[0]):
        if data1[i] * data2[i] > 0.9:
            log_b += log_beta(data1[i], data2[i])
            self_denom1 += log_single_beta(data1[i])
            self_denom2 += log_single_beta(data2[i])

        else:
            if data1[i] > 0.9:
                self_denom1 += log_single_beta(data1[i])

            if data2[i] > 0.9:
                self_denom2 += log_single_beta(data2[i])

    return np.sqrt(
        1.0 / n2 * (log_b - log_beta(n1, n2) - (self_denom2 - log_single_beta(n2)))
        + 1.0 / n1 * (log_b - log_beta(n2, n1) - (self_denom1 - log_single_beta(n1)))
    )


@numba.njit(fastmath=True)
def symmetric_kl(x, y, z=1e-11):  # pragma: no cover
    r"""
    symmetrized KL divergence between two probability distributions

    ..math::
        D(x, y) = \frac{D_{KL}\left(x \Vert y\right) + D_{KL}\left(y \Vert x\right)}{2}
    """
    n = x.shape[0]
    x_sum = 0.0
    y_sum = 0.0
    kl1 = 0.0
    kl2 = 0.0

    for i in range(n):
        x[i] += z
        x_sum += x[i]
        y[i] += z
        y_sum += y[i]

    for i in range(n):
        x[i] /= x_sum
        y[i] /= y_sum

    for i in range(n):
        kl1 += x[i] * np.log(x[i] / y[i])
        kl2 += y[i] * np.log(y[i] / x[i])

    return (kl1 + kl2) / 2


@numba.njit(fastmath=True)
def symmetric_kl_grad(x, y, z=1e-11):  # pragma: no cover
    """
    symmetrized KL divergence and its gradient

    """
    n = x.shape[0]
    x_sum = 0.0
    y_sum = 0.0
    kl1 = 0.0
    kl2 = 0.0

    for i in range(n):
        x[i] += z
        x_sum += x[i]
        y[i] += z
        y_sum += y[i]

    for i in range(n):
        x[i] /= x_sum
        y[i] /= y_sum

    for i in range(n):
        kl1 += x[i] * np.log(x[i] / y[i])
        kl2 += y[i] * np.log(y[i] / x[i])

    dist = (kl1 + kl2) / 2
    grad = (np.log(y / x) - (x / y) + 1) / 2

    return dist, grad


@numba.njit()
def correlation_grad(x, y):
    mu_x = 0.0
    mu_y = 0.0
    norm_x = 0.0
    norm_y = 0.0
    dot_product = 0.0

    for i in range(x.shape[0]):
        mu_x += x[i]
        mu_y += y[i]

    mu_x /= x.shape[0]
    mu_y /= x.shape[0]

    for i in range(x.shape[0]):
        shifted_x = x[i] - mu_x
        shifted_y = y[i] - mu_y
        norm_x += shifted_x ** 2
        norm_y += shifted_y ** 2
        dot_product += shifted_x * shifted_y

    if norm_x == 0.0 and norm_y == 0.0:
        dist = 0.0
        grad = np.zeros(x.shape)
    elif dot_product == 0.0:
        dist = 1.0
        grad = np.zeros(x.shape)
    else:
        dist = 1.0 - (dot_product / np.sqrt(norm_x * norm_y))
        grad = ((x - mu_x) / norm_x - (y - mu_y) / dot_product) * dist

    return dist, grad


@numba.njit(fastmath=True)
def sinkhorn_distance(
    x, y, M=_mock_identity, cost=_mock_cost, maxiter=64
):  # pragma: no cover
    p = (x / x.sum()).astype(np.float32)
    q = (y / y.sum()).astype(np.float32)

    u = np.ones(p.shape, dtype=np.float32)
    v = np.ones(q.shape, dtype=np.float32)

    for n in range(maxiter):
        t = M @ v
        u[t > 0] = p[t > 0] / t[t > 0]
        t = M.T @ u
        v[t > 0] = q[t > 0] / t[t > 0]

    pi = np.diag(v) @ M @ np.diag(u)
    result = 0.0
    for i in range(pi.shape[0]):
        for j in range(pi.shape[1]):
            if pi[i, j] > 0:
                result += pi[i, j] * cost[i, j]

    return result


@numba.njit(fastmath=True)
def spherical_gaussian_energy_grad(x, y):  # pragma: no cover
    mu_1 = x[0] - y[0]
    mu_2 = x[1] - y[1]

    sigma = np.abs(x[2]) + np.abs(y[2])
    sign_sigma = np.sign(x[2])

    dist = (mu_1 ** 2 + mu_2 ** 2) / (2 * sigma) + np.log(sigma) + np.log(2 * np.pi)
    grad = np.empty(3, np.float32)

    grad[0] = mu_1 / sigma
    grad[1] = mu_2 / sigma
    grad[2] = sign_sigma * (1.0 / sigma - (mu_1 ** 2 + mu_2 ** 2) / (2 * sigma ** 2))

    return dist, grad


@numba.njit(fastmath=True)
def diagonal_gaussian_energy_grad(x, y):  # pragma: no cover
    mu_1 = x[0] - y[0]
    mu_2 = x[1] - y[1]

    sigma_11 = np.abs(x[2]) + np.abs(y[2])
    sigma_12 = 0.0
    sigma_22 = np.abs(x[3]) + np.abs(y[3])

    det = sigma_11 * sigma_22
    sign_s1 = np.sign(x[2])
    sign_s2 = np.sign(x[3])

    if det == 0.0:
        # TODO: figure out the right thing to do here
        return mu_1 ** 2 + mu_2 ** 2, np.array([0.0, 0.0, 1.0, 1.0], dtype=np.float32)

    cross_term = 2 * sigma_12
    m_dist = (
        np.abs(sigma_22) * (mu_1 ** 2)
        - cross_term * mu_1 * mu_2
        + np.abs(sigma_11) * (mu_2 ** 2)
    )

    dist = (m_dist / det + np.log(np.abs(det))) / 2.0 + np.log(2 * np.pi)
    grad = np.empty(6, dtype=np.float32)

    grad[0] = (2 * sigma_22 * mu_1 - cross_term * mu_2) / (2 * det)
    grad[1] = (2 * sigma_11 * mu_2 - cross_term * mu_1) / (2 * det)
    grad[2] = sign_s1 * (sigma_22 * (det - m_dist) + det * mu_2 ** 2) / (2 * det ** 2)
    grad[3] = sign_s2 * (sigma_11 * (det - m_dist) + det * mu_1 ** 2) / (2 * det ** 2)

    return dist, grad


@numba.njit(fastmath=True)
def gaussian_energy_grad(x, y):  # pragma: no cover
    mu_1 = x[0] - y[0]
    mu_2 = x[1] - y[1]

    # Ensure width are positive
    x[2] = np.abs(x[2])
    y[2] = np.abs(y[2])

    # Ensure heights are positive
    x[3] = np.abs(x[3])
    y[3] = np.abs(y[3])

    # Ensure angle is in range -pi,pi
    x[4] = np.arcsin(np.sin(x[4]))
    y[4] = np.arcsin(np.sin(y[4]))

    # Covariance entries for y
    a = y[2] * np.cos(y[4]) ** 2 + y[3] * np.sin(y[4]) ** 2
    b = (y[2] - y[3]) * np.sin(y[4]) * np.cos(y[4])
    c = y[3] * np.cos(y[4]) ** 2 + y[2] * np.sin(y[4]) ** 2

    # Sum of covariance matrices
    sigma_11 = x[2] * np.cos(x[4]) ** 2 + x[3] * np.sin(x[4]) ** 2 + a
    sigma_12 = (x[2] - x[3]) * np.sin(x[4]) * np.cos(x[4]) + b
    sigma_22 = x[2] * np.sin(x[4]) ** 2 + x[3] * np.cos(x[4]) ** 2 + c

    # Determinant of the sum of covariances
    det_sigma = np.abs(sigma_11 * sigma_22 - sigma_12 ** 2)
    x_inv_sigma_y_numerator = (
        sigma_22 * mu_1 ** 2 - 2 * sigma_12 * mu_1 * mu_2 + sigma_11 * mu_2 ** 2
    )

    if det_sigma < 1e-32:
        return (
            mu_1 ** 2 + mu_2 ** 2,
            np.array([0.0, 0.0, 1.0, 1.0, 0.0], dtype=np.float32),
        )

    dist = x_inv_sigma_y_numerator / det_sigma + np.log(det_sigma) + np.log(2 * np.pi)

    grad = np.zeros(5, np.float32)
    grad[0] = (2 * sigma_22 * mu_1 - 2 * sigma_12 * mu_2) / det_sigma
    grad[1] = (2 * sigma_11 * mu_2 - 2 * sigma_12 * mu_1) / det_sigma

    grad[2] = mu_2 * (mu_2 * np.cos(x[4]) ** 2 - mu_1 * np.cos(x[4]) * np.sin(x[4]))
    grad[2] += mu_1 * (mu_1 * np.sin(x[4]) ** 2 - mu_2 * np.cos(x[4]) * np.sin(x[4]))
    grad[2] *= det_sigma
    grad[2] -= x_inv_sigma_y_numerator * np.cos(x[4]) ** 2 * sigma_22
    grad[2] -= x_inv_sigma_y_numerator * np.sin(x[4]) ** 2 * sigma_11
    grad[2] += x_inv_sigma_y_numerator * 2 * sigma_12 * np.sin(x[4]) * np.cos(x[4])
    grad[2] /= det_sigma ** 2 + 1e-8

    grad[3] = mu_1 * (mu_1 * np.cos(x[4]) ** 2 - mu_2 * np.cos(x[4]) * np.sin(x[4]))
    grad[3] += mu_2 * (mu_2 * np.sin(x[4]) ** 2 - mu_1 * np.cos(x[4]) * np.sin(x[4]))
    grad[3] *= det_sigma
    grad[3] -= x_inv_sigma_y_numerator * np.sin(x[4]) ** 2 * sigma_22
    grad[3] -= x_inv_sigma_y_numerator * np.cos(x[4]) ** 2 * sigma_11
    grad[3] -= x_inv_sigma_y_numerator * 2 * sigma_12 * np.sin(x[4]) * np.cos(x[4])
    grad[3] /= det_sigma ** 2 + 1e-8

    grad[4] = (x[3] - x[2]) * (
        2 * mu_1 * mu_2 * np.cos(2 * x[4]) - (mu_1 ** 2 - mu_2 ** 2) * np.sin(2 * x[4])
    )
    grad[4] *= det_sigma
    grad[4] -= x_inv_sigma_y_numerator * (x[3] - x[2]) * np.sin(2 * x[4]) * sigma_22
    grad[4] -= x_inv_sigma_y_numerator * (x[2] - x[3]) * np.sin(2 * x[4]) * sigma_11
    grad[4] -= x_inv_sigma_y_numerator * 2 * sigma_12 * (x[2] - x[3]) * np.cos(2 * x[4])
    grad[4] /= det_sigma ** 2 + 1e-8

    return dist, grad


@numba.njit(fastmath=True)
def spherical_gaussian_grad(x, y):  # pragma: no cover
    mu_1 = x[0] - y[0]
    mu_2 = x[1] - y[1]

    sigma = x[2] + y[2]
    sigma_sign = np.sign(sigma)

    if sigma == 0:
        return 10.0, np.array([0.0, 0.0, -1.0], dtype=np.float32)

    dist = (
        (mu_1 ** 2 + mu_2 ** 2) / np.abs(sigma)
        + 2 * np.log(np.abs(sigma))
        + np.log(2 * np.pi)
    )
    grad = np.empty(3, dtype=np.float32)

    grad[0] = (2 * mu_1) / np.abs(sigma)
    grad[1] = (2 * mu_2) / np.abs(sigma)
    grad[2] = sigma_sign * (
        -(mu_1 ** 2 + mu_2 ** 2) / (sigma ** 2) + (2 / np.abs(sigma))
    )

    return dist, grad


# Special discrete distances -- where x and y are objects, not vectors


def get_discrete_params(data, metric):
    if metric == "ordinal":
        return {"support_size": float(data.max() - data.min()) / 2.0}
    elif metric == "count":
        min_count = scipy.stats.tmin(data)
        max_count = scipy.stats.tmax(data)
        lambda_ = scipy.stats.tmean(data)
        normalisation = count_distance(min_count, max_count, poisson_lambda=lambda_)
        return {
            "poisson_lambda": lambda_,
            "normalisation": normalisation / 2.0,  # heuristic
        }
    elif metric == "string":
        lengths = np.array([len(x) for x in data])
        max_length = scipy.stats.tmax(lengths)
        max_dist = max_length / 1.5  # heuristic
        normalisation = max_dist / 2.0  # heuristic
        return {"normalisation": normalisation, "max_dist": max_dist / 2.0}  # heuristic

    else:
        return {}


@numba.njit()
def categorical_distance(x, y):
    if x == y:
        return 0.0
    else:
        return 1.0


@numba.njit()
def hierarchical_categorical_distance(x, y, cat_hierarchy=[{}]):
    n_levels = float(len(cat_hierarchy))
    for level, cats in enumerate(cat_hierarchy):
        if cats[x] == cats[y]:
            return float(level) / n_levels
    else:
        return 1.0


@numba.njit()
def ordinal_distance(x, y, support_size=1.0):
    return abs(x - y) / support_size


@numba.njit()
def count_distance(x, y, poisson_lambda=1.0, normalisation=1.0):
    lo = int(min(x, y))
    hi = int(max(x, y))

    log_lambda = np.log(poisson_lambda)

    if lo < 2:
        log_k_factorial = 0.0
    elif lo < 10:
        log_k_factorial = 0.0
        for k in range(2, lo):
            log_k_factorial += np.log(k)
    else:
        log_k_factorial = approx_log_Gamma(lo + 1)

    result = 0.0

    for k in range(lo, hi):
        result += k * log_lambda - poisson_lambda - log_k_factorial
        log_k_factorial += np.log(k)

    return result / normalisation


@numba.njit()
def levenshtein(x, y, normalisation=1.0, max_distance=20):
    x_len, y_len = len(x), len(y)

    # Opt out of some comparisons
    if abs(x_len - y_len) > max_distance:
        return abs(x_len - y_len) / normalisation

    v0 = np.arange(y_len + 1).astype(np.float64)
    v1 = np.zeros(y_len + 1)

    for i in range(x_len):

        v1[i] = i + 1

        for j in range(y_len):
            deletion_cost = v0[j + 1] + 1
            insertion_cost = v1[j] + 1
            substitution_cost = int(x[i] == y[j])

            v1[j + 1] = min(deletion_cost, insertion_cost, substitution_cost)

        v0 = v1

        # Abort early if we've already exceeded max_dist
        if np.min(v0) > max_distance:
            return max_distance / normalisation

    return v0[y_len] / normalisation


named_distances = {
    # general minkowski distances
    "euclidean": euclidean,
    "l2": euclidean,
    "manhattan": manhattan,
    "taxicab": manhattan,
    "l1": manhattan,
    "chebyshev": chebyshev,
    "linfinity": chebyshev,
    "linfty": chebyshev,
    "linf": chebyshev,
    "minkowski": minkowski,
    "poincare": poincare,
    # Standardised/weighted distances
    "seuclidean": standardised_euclidean,
    "standardised_euclidean": standardised_euclidean,
    "wminkowski": weighted_minkowski,
    "weighted_minkowski": weighted_minkowski,
    "mahalanobis": mahalanobis,
    # Other distances
    "canberra": canberra,
    "cosine": cosine,
    "correlation": correlation,
    "hellinger": hellinger,
    "haversine": haversine,
    "braycurtis": bray_curtis,
    "ll_dirichlet": ll_dirichlet,
    "symmetric_kl": symmetric_kl,
    # Binary distances
    "hamming": hamming,
    "jaccard": jaccard,
    "dice": dice,
    "matching": matching,
    "kulsinski": kulsinski,
    "rogerstanimoto": rogers_tanimoto,
    "russellrao": russellrao,
    "sokalsneath": sokal_sneath,
    "sokalmichener": sokal_michener,
    "yule": yule,
    # Special discrete distances
    "categorical": categorical_distance,
    "ordinal": ordinal_distance,
    "hierarchical_categorical": hierarchical_categorical_distance,
    "count": count_distance,
    "string": levenshtein,
}

named_distances_with_gradients = {
    # general minkowski distances
    "euclidean": euclidean_grad,
    "l2": euclidean_grad,
    "manhattan": manhattan_grad,
    "taxicab": manhattan_grad,
    "l1": manhattan_grad,
    "chebyshev": chebyshev_grad,
    "linfinity": chebyshev_grad,
    "linfty": chebyshev_grad,
    "linf": chebyshev_grad,
    "minkowski": minkowski_grad,
    # Standardised/weighted distances
    "seuclidean": standardised_euclidean_grad,
    "standardised_euclidean": standardised_euclidean_grad,
    "wminkowski": weighted_minkowski_grad,
    "weighted_minkowski": weighted_minkowski_grad,
    "mahalanobis": mahalanobis_grad,
    # Other distances
    "canberra": canberra_grad,
    "cosine": cosine_grad,
    "correlation": correlation_grad,
    "hellinger": hellinger_grad,
    "haversine": haversine_grad,
    "braycurtis": bray_curtis_grad,
    "symmetric_kl": symmetric_kl_grad,
    # Special embeddings
    "spherical_gaussian_energy": spherical_gaussian_energy_grad,
    "diagonal_gaussian_energy": diagonal_gaussian_energy_grad,
    "gaussian_energy": gaussian_energy_grad,
    "hyperboloid": hyperboloid_grad,
}

DISCRETE_METRICS = (
    "categorical",
    "hierarchical_categorical",
    "ordinal",
    "count",
    "string",
)

SPECIAL_METRICS = (
    "hellinger",
    "ll_dirichlet",
    "symmetric_kl",
    "poincare",
    hellinger,
    ll_dirichlet,
    symmetric_kl,
    poincare,
)


@numba.njit(parallel=True)
def parallel_special_metric(X, Y=None, metric=hellinger):
    if Y is None:
        result = np.zeros((X.shape[0], X.shape[0]))

        for i in range(X.shape[0]):
            for j in range(i + 1, X.shape[0]):
                result[i, j] = metric(X[i], X[j])
                result[j, i] = result[i, j]
    else:
        result = np.zeros((X.shape[0], Y.shape[0]))

        for i in range(X.shape[0]):
            for j in range(Y.shape[0]):
                result[i, j] = metric(X[i], Y[j])

    return result


# We can gain efficiency by chunking the matrix into blocks;
# this keeps data vectors in cache better
@numba.njit(parallel=True, nogil=True)
def chunked_parallel_special_metric(X, Y=None, metric=hellinger, chunk_size=16):
    if Y is None:
        XX, symmetrical = X, True
        row_size = col_size = X.shape[0]
    else:
        XX, symmetrical = Y, False
        row_size, col_size = X.shape[0], Y.shape[0]

    result = np.zeros((row_size, col_size), dtype=np.float32)
    n_row_chunks = (row_size // chunk_size) + 1
    for chunk_idx in numba.prange(n_row_chunks):
        n = chunk_idx * chunk_size
        chunk_end_n = min(n + chunk_size, row_size)
        m_start = n if symmetrical else 0
        for m in range(m_start, col_size, chunk_size):
            chunk_end_m = min(m + chunk_size, col_size)
            for i in range(n, chunk_end_n):
                for j in range(m, chunk_end_m):
                    result[i, j] = metric(X[i], XX[j])
    return result


def pairwise_special_metric(X, Y=None, metric="hellinger", kwds=None, force_all_finite=True):
    if callable(metric):
        if kwds is not None:
            kwd_vals = tuple(kwds.values())
        else:
            kwd_vals = ()

        @numba.njit(fastmath=True)
        def _partial_metric(_X, _Y=None):
            return metric(_X, _Y, *kwd_vals)

        return pairwise_distances(X, Y, metric=_partial_metric, force_all_finite=force_all_finite)
    else:
        special_metric_func = named_distances[metric]
    return parallel_special_metric(X, Y, metric=special_metric_func)