File: _cutils.pyx

package info (click to toggle)
python-skbio 0.6.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 11,924 kB
  • sloc: python: 67,527; ansic: 672; makefile: 225
file content (419 lines) | stat: -rw-r--r-- 12,446 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
# -----------------------------------------------------------------------------
#  Copyright (c) 2021-2021, scikit-bio development team.
#
#  Distributed under the terms of the Modified BSD License.
#
#  The full license is in the file LICENSE.txt, distributed with this software.
# -----------------------------------------------------------------------------

import numpy as np
cimport numpy as cnp
cnp.import_array()
cimport cython
from cython.parallel import prange
from libc.math cimport sqrt, fabs


ctypedef cnp.npy_intp intp_t

ctypedef fused TReal:
    float
    double

ctypedef fused floating:
    cnp.float64_t
    cnp.float32_t

ctypedef cnp.float64_t float64_t
ctypedef cnp.float32_t float32_t

@cython.boundscheck(False)
@cython.wraparound(False)
def is_symmetric_and_hollow_cy(TReal[:, ::1] mat):
    """
    Check if mat is symmetric and hollow.
    Equivalent to [not (mat.T != mat).any(), np.trace(mat) == 0]

    Parameters
    ----------
    mat : 2D array_like
        Distance matrix.

    Result:
    -------
    is_symmetric: Boolean
        not (mat.T != mat).any()
    is_hollow: Boolean
        np.trace(mat) == 0
    """
    cdef Py_ssize_t in_n = mat.shape[0]
    cdef Py_ssize_t in2 = mat.shape[1]

    assert in_n == in2

    cdef Py_ssize_t trow,tcol
    cdef Py_ssize_t trow_max,tcol_max
    cdef Py_ssize_t row,col

    cdef TReal testval

    # use int instead of bool for portability
    cdef int is_sym = True
    cdef int is_hollow = True

    # use a tiled approach to maximize memory locality
    for trow in prange(0, in_n, 24, nogil=True):
        trow_max = min(trow+24, in_n)
        for tcol in range(0, in_n, 24):
            tcol_max = min(tcol+24, in_n)
            for row in range(trow, trow_max, 1):
                for col in range(tcol, tcol_max, 1):
                   is_sym &= (mat[row,col]==mat[col,row])
            if (trow==tcol): # diagonal block, only ones that can have col==row
                for col in range(tcol, tcol_max, 1):
                   is_hollow &= (mat[col,col]==0)

    return [(is_sym==True), (is_hollow==True)]


@cython.boundscheck(False)
@cython.wraparound(False)
def distmat_reorder_cy(TReal[:, ::1] in_mat, intp_t[::1] reorder_vec,
                       TReal[:, ::1] out_mat):
    """
    Reorder the rows and columns of a distance matrix
    given a reorder vector.
    Not all of the columns need to be used.

    For example:
     [ [0, 1, 2, 3] ,
       [1, 0, 4, 5] ,
       [2, 4, 0, 6] ,
       [3, 5, 6, 0] ]
     with
     [1,0,3,2]
     will result in
     [ [0, 1, 5, 4] ,
       [1, 0, 3, 2] ,
       [5, 3, 0, 6] ,
       [4, 2, 6, 0] ]

    Note: No error checking is performed.
          The caller must ensure that all values in reorder_vec are valid

    Parameters
    ----------
    in_mat : 2D array_like
        Distance matrix.
    reorder_vec : 1D_array_like
        List of permutation indexes
    out_mat : 2D array_like
        Output, Distance matrix, must be same size as reorder_vec
    """
    cdef Py_ssize_t in_n = in_mat.shape[0]
    cdef Py_ssize_t in2 = in_mat.shape[1]
    cdef Py_ssize_t out_n = reorder_vec.shape[0]
    cdef Py_ssize_t on2 = out_mat.shape[0]
    cdef Py_ssize_t on3 = out_mat.shape[1]

    assert in_n == in2
    assert out_n == on2
    assert out_n == on3

    cdef Py_ssize_t row,col
    cdef Py_ssize_t vrow

    for row in prange(out_n, nogil=True):
        vrow = reorder_vec[row]
        for col in range(out_n):
           out_mat[row,col] = in_mat[vrow, reorder_vec[col]]

@cython.boundscheck(False)
@cython.wraparound(False)
def distmat_reorder_condensed_cy(TReal[:, ::1] in_mat, intp_t[::1] reorder_vec,
                                 TReal[::1] out_mat_condensed):
    """
    Reorder the rows and columns of a distance matrix
    given a reorder vector.
    Not all of the columns need to be used.

    For example:
     [ [0, 1, 2, 3] ,
       [1, 0, 4, 5] ,
       [2, 4, 0, 6] ,
       [3, 5, 6, 0] ]
     with
     [1,0,3,2]
     will result in
     [ 1, 5, 4, 3, 2, 6 ]

    Note: No error checking is performed.
          The caller must ensure that all values in reorder_vec are valid

    Parameters
    ----------
    in_mat : 2D array_like
        Distance matrix.
    reorder_vec : 1D_array_like
        List of permutation indexes
    out_mat_condensed : 1D array_like
        Output, condensed distance matrix
    """
    cdef Py_ssize_t in_n = in_mat.shape[0]
    cdef Py_ssize_t in2 = in_mat.shape[1]
    cdef Py_ssize_t out_n = reorder_vec.shape[0]
    cdef Py_ssize_t on2 = out_mat_condensed.shape[0]

    assert in_n == in2
    assert on2 == ((out_n-1)*out_n)/2

    cdef Py_ssize_t row,col
    cdef Py_ssize_t vrow
    cdef Py_ssize_t idx

    for row in prange(out_n-1, nogil=True):
        vrow = reorder_vec[row]
        idx = row*(out_n-1) - ((row-1)*row)//2
        for col in range(out_n-row-1):
           out_mat_condensed[idx+col] = in_mat[vrow, reorder_vec[col+row+1]]


@cython.boundscheck(False)
@cython.wraparound(False)
def mantel_perm_pearsonr_cy(TReal[:, ::1] x_data, intp_t[:, ::1] perm_order,
                            TReal xmean, TReal normxm,
                            TReal[::1] ym_normalized,
                            TReal[::1] permuted_stats):
    """
    Fused permute, fma, pearsonr for mantel.

    Replaces the following python code:
    def _mantel_perm_pearsonr_one(x_flat, xmean, normxm, ym_normalized):
        # inline pearsonr, condensed from scipy.stats.pearsonr
        # and reusing some of the known values
        xm_normalized = (x_flat - xmean)/normxm
        one_stat = np.dot(xm_normalized, ym_normalized)
        one_stat = max(min(one_stat, 1.0), -1.0)
        return one_stat

    perm_gen = (_mantel_perm_pearsonr_one(distmat_reorder_condensed(x._data, perm_order[p,:]),
                                          xmean, normxm, ym_normalized)
                for p in range(permutations))
    permuted_stats = np.fromiter(perm_gen, np.float, count=permutations)

    Parameters
    ----------
    x_data : 2D array_like
        Distance matrix.
    perm_order : 2D array_like
        List of permutation orders.
    xmean: real
        Mean value of condensed x_data
    normxm: real
        Norm of pre-processed xm
    ym_normalized : 1D_array_like
        Normalized condensed y_data
    permuted_stats : 1D array_like
        Output, Pearson stats
    """
    cdef Py_ssize_t in_n = x_data.shape[0]
    cdef Py_ssize_t in2 = x_data.shape[1]
    cdef Py_ssize_t perms_n = perm_order.shape[0]
    cdef Py_ssize_t out_n = perm_order.shape[1]
    cdef Py_ssize_t y_n = ym_normalized.shape[0]
    cdef Py_ssize_t on2 = permuted_stats.shape[0]

    assert in_n == in2
    assert y_n == ((out_n-1)*out_n)/2
    assert perms_n == on2

    cdef Py_ssize_t p
    cdef Py_ssize_t row,col,icol
    cdef Py_ssize_t vrow
    cdef Py_ssize_t idx

    cdef TReal mul = 1.0/normxm
    cdef TReal add = -xmean/normxm

    cdef TReal my_ps
    cdef TReal yval
    cdef TReal xval

    for p in prange(perms_n, nogil=True):
        my_ps = 0.0
        for row in range(out_n-1):
            vrow = perm_order[p, row]
            idx = row*(out_n-1) - ((row-1)*row)//2
            for icol in range(out_n-row-1):
               col = icol+row+1
               yval = ym_normalized[idx+icol]
               xval = x_data[vrow, perm_order[p, col]]*mul + add
               # do not use += to avoid having prange consider it for reduction
               my_ps = yval*xval + my_ps

        # Presumably, if abs(one_stat) > 1, then it is only some small artifact of
        # floating point arithmetic.
        if my_ps>1.0:
            my_ps = 1.0
        elif my_ps<-1.0:
            my_ps = -1.0
        permuted_stats[p] = my_ps


@cython.boundscheck(False)
@cython.wraparound(False)
def permanova_f_stat_sW_cy(TReal[:, ::1] distance_matrix,
                           Py_ssize_t[::1] group_sizes,
                           Py_ssize_t[::1] grouping):
    """Compute PERMANOVA pseudo-F partial statistic."""
    cdef Py_ssize_t in_n = distance_matrix.shape[0]
    cdef Py_ssize_t in2 = distance_matrix.shape[1]
    cdef Py_ssize_t in3 = grouping.shape[0]

    assert in_n == in2
    assert in_n == in3

    cdef double s_W = 0.0

    cdef Py_ssize_t group_idx
    cdef double local_s_W
    cdef double val

    cdef Py_ssize_t row, col, rowi, coli
    cdef Py_ssize_t in_n_2 = in_n//2

    for rowi in prange(in_n_2, nogil=True):
        # since columns get shorter, combine first and last
        row=rowi
        local_s_W = 0.0
        group_idx = grouping[row]
        for coli in range(in_n-row-1):
            col = coli+row+1
            if grouping[col] == group_idx:
                val = distance_matrix[row,col]
                local_s_W = local_s_W + val * val
        s_W += local_s_W/group_sizes[group_idx]

        row = in_n-rowi-2
        if row!=rowi: # don't double count
            local_s_W = 0.0
            group_idx = grouping[row]
            for coli in range(in_n-row-1):
                col = coli+row+1
                if grouping[col] == group_idx:
                    val = distance_matrix[row,col]
                    local_s_W = local_s_W + val * val
            s_W += local_s_W/group_sizes[group_idx]

    return s_W

@cython.boundscheck(False)
@cython.wraparound(False)
def geomedian_axis_one(floating[:, :] X, floating eps=1e-7,
                           size_t maxiters=500):
    """Compute high dimensional median.
    
    This function, its helpers (dist_euclidean, norm_euclidean, sum), and necessary
    type definitions (floating) were ported from hdmedians v0.14.2. The only change was
    changing "cdef" to "def" on the line defining the function. See
    https://github.com/daleroberts/hdmedians for more information.
    """
    cdef size_t p = X.shape[0]
    cdef size_t n = X.shape[1]

    cdef floating[:] y = np.mean(X, axis=1)

    if n == 1:
        return y

    if floating is cnp.float32_t:
        dtype = np.float32
    else:
        dtype = np.float64

    cdef floating[:] D = np.empty(n, dtype=dtype)
    cdef floating[:] Dinv = np.empty(n, dtype=dtype)
    cdef floating[:] W = np.empty(n, dtype=dtype)
    cdef floating[:] T = np.empty(p, dtype=dtype)
    cdef floating[:] y1 = np.empty(p, dtype=dtype)
    cdef floating[:] R = np.empty(p, dtype=dtype)

    cdef floating dist, Dinvs, total, r, rinv, tmp, Di
    cdef size_t nzeros = n
    cdef size_t iteration

    with nogil:
        iteration = 0
        while iteration < maxiters:

            for i in range(n):
                Di = _dist_euclidean(X[:, i], y)
                D[i] = Di
                if fabs(Di) > eps:
                    Dinv[i] = 1. / Di
                else:
                    Dinv[i] = 0.

            Dinvs = _sum(Dinv)

            for i in range(n):
                W[i] = Dinv[i] / Dinvs

            for j in range(p):
                total = 0.
                for i in range(n):
                    if fabs(D[i]) > eps:
                        total += W[i] * X[j, i]
                T[j] = total

            nzeros = n
            for i in range(n):
                if fabs(D[i]) > eps:
                    nzeros -= 1

            if nzeros == 0:
                y1 = T
            elif nzeros == n:
                break
            else:
                for j in range(p):
                    R[j] = (T[j] - y[j]) * Dinvs
                r = _norm_euclidean(R)
                if r > eps:
                    rinv = nzeros/r
                else:
                    rinv = 0.
                for j in range(p):
                    y1[j] = max(0, 1-rinv)*T[j] + min(1, rinv)*y[j]

            dist = _dist_euclidean(y, y1)
            if dist < eps:
               break

            y[:] = y1
            iteration = iteration + 1
            
    return y

cdef floating _dist_euclidean(floating[:] x, floating[:] y) nogil:
    cdef size_t n = x.shape[0]
    cdef float64_t d = 0.
    cdef float64_t tmp
    for i in range(n):
        tmp = x[i] - y[i]
        d += tmp * tmp
    return <floating>sqrt(d)

cdef floating _norm_euclidean(floating[:] x) nogil:
    cdef size_t n = x.shape[0]
    cdef float64_t d = 0.
    for i in range(n):
        d += x[i] * x[i]
    return <floating>sqrt(d)

cdef floating _sum(floating[:] x) nogil:
    cdef size_t n = x.shape[0]
    cdef float64_t total = 0.
    for i in range(n):
        total += x[i]
    return <floating>total