File: _subsample.pyx

package info (click to toggle)
python-biom-format 2.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,548 kB
  • ctags: 1,051
  • sloc: python: 6,257; makefile: 137; sh: 52
file content (69 lines) | stat: -rw-r--r-- 1,946 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, biom development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import division

import numpy as np
cimport numpy as cnp


def _subsample(arr, n):
    """Subsample non-zero values of a sparse array

    Parameters
    ----------
    arr : {csr_matrix, csc_matrix}
        A 1xM sparse vector
    n : int
        Number of items to subsample from `arr`
    
    Returns
    -------
    ndarray
        Subsampled data

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    cdef:
        cnp.int64_t counts_sum
        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
        cnp.ndarray[cnp.float64_t, ndim=1] result
        cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
        cnp.ndarray[cnp.int32_t, ndim=1] permuted, unpacked
        cnp.float64_t cnt
        Py_ssize_t unpacked_idx, i, j

    for i in range(indptr.shape[0] - 1):
        start, end = indptr[i], indptr[i+1]
        counts_sum = data[start:end].sum()
       
        if counts_sum < n:
            data[start:end] = 0
            continue

        unpacked = np.empty(counts_sum, dtype=np.int32)
        unpacked_idx = 0

        for i in range(start, end):
            cnt = data[i]

            for j in range(int(cnt)):
                unpacked[unpacked_idx] = i - start
                unpacked_idx += 1
       
        permuted = np.random.permutation(unpacked)[:n]
        
        result = np.zeros(end - start, dtype=np.float64)
        for idx in range(permuted.shape[0]):
            result[permuted[idx]] += 1

        data[start:end] = result