File: stats.py

package info (click to toggle)

scikit-learn 0.18-5

links: PTS, VCS
area: main
in suites: stretch
size: 71,040 kB
ctags: 91,142
sloc: python: 97,257; ansic: 8,360; cpp: 5,649; makefile: 242; sh: 238

file content (59 lines) | stat: -rw-r--r-- 1,692 bytes

import numpy as np
from scipy.stats import rankdata as _sp_rankdata
from .fixes import bincount


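# TODO: remove this backport once the minimum supported SciPy version is 0.13
# (older releases are presumably the ones lacking rankdata's ``method``
# argument, which is what the compatibility check further down probes for).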
def _rankdata(a, method="average"):
    """Assign ranks to data, dealing with ties appropriately.

    Ranks begin at 1. The method argument controls how ranks are assigned
    to equal values.

    Parameters
    ----------
    a : array_like
        The array of values to be ranked. The array is first flattened.

    method : str, optional
        The method used to assign ranks to tied elements.
        The options are 'max'.
        'max': The maximum of the ranks that would have been assigned
              to all the tied values is assigned to each value.

    Returns
    -------
    ranks : ndarray
        An array of length equal to the size of a, containing rank scores.

    Notes
    -----
    We only backport the 'max' method

    """
    if method != "max":
        raise NotImplementedError()

    unique_all, inverse = np.unique(a, return_inverse=True)
    count = bincount(inverse, minlength=unique_all.size)
    cum_count = count.cumsum()
    rank = cum_count[inverse]
    return rank

# Pick the implementation exposed as ``rankdata``: probe whether the SciPy
# function accepts a tie-breaking method argument, and fall back to the
# local backport when the probe call is rejected with a TypeError.
try:
    _sp_rankdata([1.], 'max')
except TypeError:
    rankdata = _rankdata
else:
    rankdata = _sp_rankdata


def _weighted_percentile(array, sample_weight, percentile=50):
    """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. """
    sorted_idx = np.argsort(array)

    # Find index of median prediction for each sample
    weight_cdf = sample_weight[sorted_idx].cumsum()
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]]