File: _subsample.pyx

package info (click to toggle)
python-biom-format 2.1.7%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,820 kB
  • sloc: python: 12,757; makefile: 155; sh: 79
file content (67 lines) | stat: -rw-r--r-- 2,084 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

from __future__ import division

import numpy as np
cimport numpy as cnp


def _subsample(arr, n, with_replacement):
    """Subsample non-zero values of a sparse array

    Parameters
    ----------
    arr : {csr_matrix, csc_matrix}
        A 1xM sparse vector
    n : int
        Number of items to subsample from `arr`
    
    Returns
    -------
    ndarray
        Subsampled data

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    cdef:
        cnp.int64_t counts_sum
        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
        cnp.ndarray[cnp.int32_t, ndim=1] data_i = arr.data.astype(np.int32)
        cnp.ndarray[cnp.float64_t, ndim=1] result
        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked, r
        cnp.float64_t cnt
        Py_ssize_t i, j, length

    for i in range(indptr.shape[0] - 1):
        start, end = indptr[i], indptr[i+1]
        length = end - start
        counts_sum = data[start:end].sum()
        
        if with_replacement:
            pvals = data[start:end] / counts_sum
            data[start:end] = np.random.multinomial(n, pvals)
        else:
            if counts_sum < n:
                data[start:end] = 0
                continue

            r = np.arange(length, dtype=np.int32)
            unpacked = np.repeat(r, data_i[start:end])
            permuted = np.random.permutation(unpacked)[:n]

            result = np.zeros(length, dtype=np.float64)
            for idx in range(permuted.shape[0]):
                result[permuted[idx]] += 1

            data[start:end] = result