File: _subsample.pyx

package info (click to toggle)
python-biom-format 2.1.17-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 52,044 kB
  • sloc: python: 13,624; makefile: 149; sh: 105; javascript: 45
file content (174 lines) | stat: -rw-r--r-- 6,250 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------

import numpy as np
cimport numpy as cnp
cnp.import_array()


cdef _subsample_with_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
                                 cnp.ndarray[cnp.int32_t, ndim=1] indptr,
                                 cnp.int64_t n,
                                 object rng):
    """Subsample non-zero values of a sparse array with replacement

    Note: this method operates in place

    Parameters
    ----------
    data : {csr_matrix, csc_matrix}.data
        A 1xM sparse vector data
    indptr : {csr_matrix, csc_matrix}.indptr
        A 1xM sparse vector indptr
    n : int
        Number of items to subsample from `arr`
    rng : Generator instance
        A random generator. This will likely be an instance returned 
        by np.random.default_rng

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    cdef:
        cnp.float64_t counts_sum
        cnp.int32_t start,end,length
        Py_ssize_t i
        cnp.ndarray[cnp.float64_t, ndim=1] pvals
        cnp.ndarray[cnp.float64_t, ndim=1] data_ceil 
        
    data_ceil = np.ceil(data)
    for i in range(indptr.shape[0] - 1):
        start, end = indptr[i], indptr[i+1]
        length = end - start

        # base p-values on integer data to avoid small numerical issues with 
        # float on sum
        counts_sum = data_ceil[start:end].sum()
        pvals = data_ceil[start:end] / counts_sum

        data[start:end] = rng.multinomial(n, pvals)


cdef _subsample_without_replacement(cnp.ndarray[cnp.float64_t, ndim=1] data,
                                    cnp.ndarray[cnp.int32_t, ndim=1] indptr,
                                    cnp.int64_t n,
                                    object rng):
    """Subsample non-zero values of a sparse array w/out replacement

    Note: this method operates in place

    Parameters
    ----------
    data : {csr_matrix, csc_matrix}.data
        A 1xM sparse vector data
    indptr : {csr_matrix, csc_matrix}.indptr
        A 1xM sparse vector indptr
    n : int
        Number of items to subsample from `arr`
    rng : Generator instance
        A random generator. This will likely be an instance returned 
        by np.random.default_rng
    """
    cdef:
        cnp.int64_t counts_sum, count_el, perm_count_el
        cnp.int64_t count_rem
        cnp.ndarray[cnp.int64_t, ndim=1] permuted, intdata
        Py_ssize_t i, idx
        cnp.int32_t length,el,start,end
        cnp.int64_t el_cnt

    for i in range(indptr.shape[0] - 1):
        start, end = indptr[i], indptr[i+1]
        length = end - start
        # We are relying on data being integers
        # If there are rounding erros, fp64 sums can lead to
        # big errors in sum, so convert to int64, first
        intdata = data[start:end].astype(np.int64)
        counts_sum = intdata.sum()
        
        if counts_sum < n:
            data[start:end] = 0
            continue

        permuted = rng.choice(counts_sum, n, replace=False, shuffle=False)
        permuted.sort()

        # now need to do reverse mapping
        # since I am not using np.repeat anymore
        # reminder, old logic was
        #   r = np.arange(length)
        #   unpacked = np.repeat(r, data_i[start:end])
        #   permuted_unpacked = rng.choice(unpacked, n, replace=False, shuffle=False)
        # 
        # specifically, what we're going to do here is randomly pick what elements within
        # each sample to keep. this is analogous issuing the prior np.repeat call, and obtaining
        # a random set of index positions for that resulting array. however, we do not need to 
        # perform the np.repeat call as we know the length of that resulting vector already,
        # and additionally, we can compute the sample associated with an index in that array
        # without constructing it.

        el = 0         # index in result/data
        count_el = 0  # index in permutted
        count_rem = intdata[0]  # since each data has multiple els, keep track how many are left
        el_cnt = 0
        for idx in range(n):
            perm_count_el = permuted[idx]
            # The array is sorted, so just jump ahead if needed
            # Move until we get withing the elements range
            while (perm_count_el - count_el) >= count_rem:
               #save the computed value
               data[start+el] = el_cnt
               # move to next element
               el += 1
               # move to the beginning of next element
               count_el += count_rem
               # Load how much we have avaialble
               count_rem = intdata[el]
               #re-start the el counter
               el_cnt = 0
            # increment the el counter
            el_cnt += 1
            # update the counters
            # reduce what is left
            count_rem -= (perm_count_el - count_el)
            #move the pointer to where we stopped
            count_el = perm_count_el
        # save the last value
        data[start+el] = el_cnt
        # clean up tail elements
        data[start+el+1:end] = 0


def subsample(arr, n, with_replacement, rng):
    """Subsample non-zero values of a sparse array

    Note: this method operates in place

    Parameters
    ----------
    arr : {csr_matrix, csc_matrix}
        A 1xM sparse vector
    n : int
        Number of items to subsample from `arr`
    with_replacement : bool
        Whether to permute or use multinomial sampling
    rng : Generator instance
        A random generator. This will likely be an instance returned 
        by np.random.default_rng

    Notes
    -----
    This code was adapted from scikit-bio (`skbio.math._subsample`)

    """
    if (with_replacement):
       _subsample_with_replacement(arr.data, arr.indptr, n, rng)
    else:
       _subsample_without_replacement(arr.data, arr.indptr, n, rng)