File: stats.py

package info (click to toggle)
scikit-learn 0.18-5
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 71,040 kB
  • ctags: 91,142
  • sloc: python: 97,257; ansic: 8,360; cpp: 5,649; makefile: 242; sh: 238
file content (59 lines) | stat: -rw-r--r-- 1,692 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
from scipy.stats import rankdata as _sp_rankdata
from .fixes import bincount


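# TODO: remove this backport once the minimum supported scipy is 0.13
# (the release in which scipy.stats.rankdata gained its ``method`` argument).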
def _rankdata(a, method="average"):
    """Assign ranks to data, dealing with ties appropriately.

    Ranks begin at 1. The method argument controls how ranks are assigned
    to equal values.

    Parameters
    ----------
    a : array_like
        The array of values to be ranked. The array is first flattened.

    method : str, optional
        The method used to assign ranks to tied elements.
        The options are 'max'.
        'max': The maximum of the ranks that would have been assigned
              to all the tied values is assigned to each value.

    Returns
    -------
    ranks : ndarray
        An array of length equal to the size of a, containing rank scores.

    Notes
    -----
    We only backport the 'max' method

    """
    if method != "max":
        raise NotImplementedError()

    unique_all, inverse = np.unique(a, return_inverse=True)
    count = bincount(inverse, minlength=unique_all.size)
    cum_count = count.cumsum()
    rank = cum_count[inverse]
    return rank

# Feature probe: a scipy whose rankdata does not accept the ``method``
# argument raises TypeError on this call, in which case the local 'max'-only
# backport is exposed as ``rankdata`` instead of the scipy implementation.
try:
    _sp_rankdata([1.], 'max')
except TypeError:
    rankdata = _rankdata
else:
    rankdata = _sp_rankdata


def _weighted_percentile(array, sample_weight, percentile=50):
    """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``. """
    sorted_idx = np.argsort(array)

    # Find index of median prediction for each sample
    weight_cdf = sample_weight[sorted_idx].cumsum()
    percentile_idx = np.searchsorted(
        weight_cdf, (percentile / 100.) * weight_cdf[-1])
    return array[sorted_idx[percentile_idx]]