1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
|
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.int64
ctypedef np.int64_t DTYPE_t
@cython.boundscheck(False)
@cython.wraparound(False)
def _tip_distances(np.ndarray[np.double_t, ndim=1] a, object t,
np.ndarray[DTYPE_t, ndim=1] tip_indices):
"""Sets each tip to its distance from the root
Parameters
----------
a : np.ndarray of double
A matrix in which each row corresponds to a node in ``t``.
t : skbio.tree.TreeNode
The tree that corresponds to the rows in ``a``.
tip_indices : np.ndarray of int
The index positions in ``a`` of the tips in ``t``.
Returns
-------
np.ndarray of double
A matrix in which each row corresponds to a node in ``t``, Only the
rows that correspond to tips are nonzero, and the values in these rows
are the distance from that tip to the root of the tree.
"""
cdef:
object n
Py_ssize_t i, p_i, n_rows
np.ndarray[np.double_t, ndim=1] mask
np.ndarray[np.double_t, ndim=1] tip_ds = a.copy()
# preorder reduction over the tree to gather distances at the tips
n_rows = tip_ds.shape[0]
for n in t.preorder(include_self=False):
i = n.id
p_i = n.parent.id
tip_ds[i] += tip_ds[p_i]
# construct a mask that represents the locations of the tips
mask = np.zeros(n_rows, dtype=np.double)
for i in range(tip_indices.shape[0]):
mask[tip_indices[i]] = 1.0
# apply the mask such that tip_ds only includes values which correspond to
# the tips of the tree.
for i in range(n_rows):
tip_ds[i] *= mask[i]
return tip_ds
@cython.boundscheck(False)
@cython.wraparound(False)
cdef _traverse_reduce(np.ndarray[DTYPE_t, ndim=2] child_index,
np.ndarray[DTYPE_t, ndim=2] a):
"""Apply a[k] = sum[i:j]
Parameters
----------
child_index: np.array of int
A matrix in which the first column corresponds to an index position in
``a``, which represents a node in a tree. The second column is the
starting index in ``a`` for the node's children, and the third column
is the ending index in ``a`` for the node's children.
a : np.ndarray of int
A matrix of the environment data. Each row corresponds to a node in a
tree, and each column corresponds to an environment. On input, it is
assumed that only tips have counts.
Notes
-----
This is effectively a postorder reduction over the tree. For example,
given the following tree:
/-A
/E-------|
| \-B
-root----|
| /-C
\F-------|
\-D
And assuming counts for [A, B, C, D] in environment FOO of [1, 1, 1, 0] and
counts for environment BAR of [0, 1, 1, 1], the input counts matrix ``a``
would be:
[1 0 -> A
1 1 -> B
1 1 -> C
0 1 -> D
0 0 -> E
0 0 -> F
0 0] -> root
The method will perform the following reduction:
[1 0 [1 0 [1 0 [1 0
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
0 1 -> 0 1 -> 0 1 -> 0 1
0 0 2 1 2 1 2 1
0 0 0 0 1 2 1 2
0 0] 0 0] 0 0] 3 3]
The index positions of the above are encoded in ``child_index`` which
describes the node to aggregate into, and the start and stop index
positions of the nodes immediate descendents.
This method operates inplace on ``a``
"""
cdef:
Py_ssize_t i, j, k
DTYPE_t node, start, end
DTYPE_t n_envs = a.shape[1]
# possible GPGPU target
for i in range(child_index.shape[0]):
node = child_index[i, 0]
start = child_index[i, 1]
end = child_index[i, 2]
for j in range(start, end + 1):
for k in range(n_envs):
a[node, k] += a[j, k]
@cython.boundscheck(False)
@cython.wraparound(False)
def _nodes_by_counts(np.ndarray counts,
np.ndarray tip_ids,
dict indexed):
"""Construct the count array, and the counts up the tree
Parameters
----------
counts : np.array of int
A 1D or 2D vector in which each row corresponds to the observed counts
in an environment. The rows are expected to be in order with respect to
`tip_ids`.
tip_ids : np.array of str
A vector of tip names that correspond to the columns in the `counts`
matrix.
indexed : dict
The result of `index_tree`.
Returns
-------
np.array of int
The observed counts of every node and the counts if its descendents.
"""
cdef:
np.ndarray nodes, observed_ids
np.ndarray[DTYPE_t, ndim=2] count_array, counts_t
np.ndarray[DTYPE_t, ndim=1] observed_indices, otus_in_nodes
Py_ssize_t i, j
set observed_ids_set
object n
dict node_lookup
DTYPE_t n_count_vectors, n_count_otus
nodes = indexed['name']
# allow counts to be a vector
counts = np.atleast_2d(counts)
counts = counts.astype(DTYPE, copy=False)
# determine observed IDs. It may be possible to unroll these calls to
# squeeze a little more performance
observed_indices = counts.sum(0).nonzero()[0]
observed_ids = tip_ids[observed_indices]
observed_ids_set = set(observed_ids)
# construct mappings of the observed to their positions in the node array
node_lookup = {}
for i in range(nodes.shape[0]):
n = nodes[i]
if n in observed_ids_set:
node_lookup[n] = i
# determine the positions of the observed IDs in nodes
otus_in_nodes = np.zeros(observed_ids.shape[0], dtype=DTYPE)
for i in range(observed_ids.shape[0]):
n = observed_ids[i]
otus_in_nodes[i] = node_lookup[n]
# count_array has a row per node (not tip) and a column per env.
n_count_vectors = counts.shape[0]
count_array = np.zeros((nodes.shape[0], n_count_vectors), dtype=DTYPE)
# populate the counts array with the counts of each observation in each
# env
counts_t = counts.transpose()
n_count_otus = otus_in_nodes.shape[0]
for i in range(n_count_otus):
for j in range(n_count_vectors):
count_array[otus_in_nodes[i], j] = counts_t[observed_indices[i], j]
child_index = indexed['child_index'].astype(DTYPE, copy=False)
_traverse_reduce(child_index, count_array)
return count_array
|