# -----------------------------------------------------------------------------
# Copyright (c) 2011-2017, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# -----------------------------------------------------------------------------
from __future__ import division, print_function

from itertools import compress
from types import FunctionType

try:
    # The ABC aliases were removed from ``collections`` in Python 3.10.
    from collections.abc import Iterable
except ImportError:
    # Python 2 fallback.
    from collections import Iterable

import numpy as np
cimport numpy as cnp
cdef cnp.ndarray[cnp.uint8_t, ndim=1] \
    _make_filter_array_general(arr,
                               ids,
                               metadata,
                               func,
                               axis,
                               cnp.uint8_t invert):
    """Evaluate a filter function on each row (or column) of a sparse matrix.

    Faster version of

        [bool(func(vals_i, id_i, md_i)) ^ invert for
            (vals_i, id_i, md_i) in zip(rows/cols, ids, metadata)]

    Parameters
    ----------
    arr : sparse matrix
        Compressed sparse matrix whose ``data`` (float64), ``indices``
        (int32) and ``indptr`` (int32) buffers are read directly. Each
        ``indptr[i]:indptr[i+1]`` slice is expanded into one dense vector,
        so the matrix must be compressed along the axis being filtered.
    ids : sequence
        One id per vector; ``len(ids)`` determines how many vectors are
        evaluated.
    metadata : sequence
        One metadata entry per vector, indexed in step with ``ids``.
    func : callable
        Called as ``func(values, id_, md)``; its result is converted with
        ``bool``.
    axis : int
        Used to pick the dense vector length, ``arr.shape[::-1][axis]``.
    invert : uint8
        XORed with every result: 0 keeps the function output, 1 negates it.

    Returns
    -------
    bools : 1D ndarray of dtype uint8
        One flag per entry of ``ids``.

    Notes
    -----
    The same dense buffer is reused and overwritten for every call to
    ``func``; a function that needs to retain the values must copy them.
    """
    cdef:
        Py_ssize_t i, j, k, n = arr.shape[::-1][axis]
        cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data, \
            row_or_col = np.zeros(n)
        cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr, \
            indices = arr.indices
        cnp.ndarray[cnp.uint8_t, ndim=1] bools = \
            np.empty(len(ids), dtype=np.uint8)
        cnp.int32_t start, end

    for i in range(len(ids)):
        start, end = indptr[i], indptr[i+1]

        # Rebuild the dense vector from scratch: clear the reused buffer,
        # then scatter the stored entries into it. Unlike a single merged
        # pass over ``range(n)``, this does not depend on the indices of a
        # row being sorted or unique, so no value from a previous row can
        # leak through; duplicate entries are summed, which is how sparse
        # matrices define them.
        for j in range(n):
            row_or_col[j] = 0
        for k in range(start, end):
            row_or_col[indices[k]] += data[k]

        # After converting the output of the filtering function to a
        # bool, we XOR it with invert (if invert is false it doesn't
        # modify the function output, if it's true it inverts it).
        bools[i] = bool(func(row_or_col, ids[i], metadata[i])) ^ invert
    return bools
cdef _remove_rows_csr(arr, cnp.ndarray[cnp.uint8_t, ndim=1] booleans):
    """In-place sparse counterpart of ``arr[booleans]`` on a dense array.

    Rows whose flag in ``booleans`` is zero are dropped by shifting the
    entries of the kept rows towards the front of the ``data``,
    ``indices`` and ``indptr`` buffers. ``arr`` is then pointed at the
    shortened slices of those buffers and its shape is updated. Nothing is
    returned.
    """
    cdef cnp.ndarray[cnp.int32_t, ndim=1] indptr = arr.indptr
    cdef cnp.ndarray[cnp.int32_t, ndim=1] indices = arr.indices
    cdef cnp.ndarray[cnp.float64_t, ndim=1] data = arr.data
    cdef cnp.int32_t start, end
    cdef Py_ssize_t n_rows, n_cols, row, new_row, new_rows, src, dst
    cdef Py_ssize_t dropped_rows = 0, dropped_nnz = 0, kept_nnz = 0

    n_rows, n_cols = arr.shape

    for row in range(n_rows):
        start, end = indptr[row], indptr[row+1]

        if not booleans[row]:
            # Everything stored after a dropped row moves left by its size.
            dropped_nnz += end - start
            dropped_rows += 1
            continue

        # Row pointers of the kept row, expressed in the compacted layout.
        new_row = row - dropped_rows
        indptr[new_row] = kept_nnz
        kept_nnz += end - start
        indptr[new_row + 1] = kept_nnz

        # Destinations never overtake sources, so copying forwards is safe.
        for src in range(start, end):
            dst = src - dropped_nnz
            data[dst] = data[src]
            indices[dst] = indices[src]

    new_rows = n_rows - dropped_rows
    arr.data = data[:kept_nnz]
    arr.indices = indices[:kept_nnz]
    arr.indptr = indptr[:new_rows + 1]
    arr._shape = (new_rows, n_cols)
def _filter(arr, ids, metadata, index, ids_to_keep, axis, invert):
    """Filter row/columns of a sparse matrix according to the output of a
    boolean function.

    Parameters
    ----------
    arr : sparse matrix
        Converted to CSR (``axis`` 0) or CSC (``axis`` 1) before filtering.
    ids : 1D array_like
    metadata : 1D array_like or None
        When ``None``, the filter function receives ``None`` as the
        metadata of every id.
    index : dict
        Maps id to index
    ids_to_keep : callable or iterable
        Either an iterable of ids, each looked up in ``index``, or a
        callable invoked as ``ids_to_keep(values, id_, md)`` whose result
        is interpreted as a bool. An object that is both iterable and
        callable is treated as an iterable.
    axis : int
        0 to filter rows, 1 to filter columns.
    invert : bool
        If true, the selection is negated.

    Returns
    -------
    arr : sparse matrix
    ids : 1D ndarray of dtype object
    metadata : tuple
        ``None`` if the input ``metadata`` was ``None``.

    Raises
    ------
    ValueError
        If ``axis`` is neither 0 nor 1.
    TypeError
        If ``ids_to_keep`` is neither an iterable nor a callable.
    """
    cdef cnp.ndarray[cnp.uint8_t, ndim=1] bools

    # Any other axis would leave the matrix untouched while ids and
    # metadata were still filtered, giving an inconsistent result.
    if axis != 0 and axis != 1:
        raise ValueError("axis must be 0 or 1")

    invert = bool(invert)
    metadata_is_None = metadata is None

    # General version (i.e., filter functions accepts values, ids and
    # metadata) requires CSR for axis 0 and CSC for axis 1.
    if axis == 0:
        arr = arr.tocsr()
    else:
        arr = arr.tocsc()

    if metadata_is_None:
        metadata = (None,) * len(ids)

    if isinstance(ids_to_keep, Iterable):
        idx = [index[id_] for id_ in ids_to_keep]
        ids_to_keep = np.zeros(len(ids), dtype=bool)
        ids_to_keep.put(idx, True)
        bools = np.bitwise_xor(ids_to_keep, invert).view(np.uint8)
    elif callable(ids_to_keep):
        # ``callable`` rather than a FunctionType check so that bound
        # methods, partial objects and callable instances are accepted.
        bools = _make_filter_array_general(arr, ids, metadata, ids_to_keep,
                                           axis, invert)
    else:
        raise TypeError("ids_to_keep must be an iterable or a function")

    if axis == 0:
        _remove_rows_csr(arr, bools)
    else:
        arr = arr.T  # arr was CSC, CSR after transposing
        _remove_rows_csr(arr, bools)
        arr = arr.T  # Back to CSC

    ids = np.asarray(list(compress(ids, bools)), dtype=object)
    metadata = tuple(compress(metadata, bools))

    if metadata_is_None:
        metadata = None
    return arr, ids, metadata
|