File: _phylogenetic.pyx

package info (click to toggle)
python-skbio 0.5.8-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 13,224 kB
  • sloc: python: 47,839; ansic: 672; makefile: 210; javascript: 50; sh: 19
file content (216 lines) | stat: -rw-r--r-- 7,018 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
cimport numpy as np
cimport cython

DTYPE = np.int64
ctypedef np.int64_t DTYPE_t


@cython.boundscheck(False)
@cython.wraparound(False)
def _tip_distances(np.ndarray[np.double_t, ndim=1] a, object t,
                   np.ndarray[DTYPE_t, ndim=1] tip_indices):
    """Sets each tip to its distance from the root

    Parameters
    ----------
    a : np.ndarray of double
        A matrix in which each row corresponds to a node in ``t``.
    t : skbio.tree.TreeNode
        The tree that corresponds to the rows in ``a``.
    tip_indices : np.ndarray of int
        The index positions in ``a`` of the tips in ``t``.

    Returns
    -------
    np.ndarray of double
        A matrix in which each row corresponds to a node in ``t``, Only the
        rows that correspond to tips are nonzero, and the values in these rows
        are the distance from that tip to the root of the tree.
    """
    cdef:
        object n
        Py_ssize_t i, p_i, n_rows
        np.ndarray[np.double_t, ndim=1] mask
        np.ndarray[np.double_t, ndim=1] tip_ds = a.copy()

    # preorder reduction over the tree to gather distances at the tips
    n_rows = tip_ds.shape[0]
    for n in t.preorder(include_self=False):
        i = n.id
        p_i = n.parent.id

        tip_ds[i] += tip_ds[p_i]

    # construct a mask that represents the locations of the tips
    mask = np.zeros(n_rows, dtype=np.double)
    for i in range(tip_indices.shape[0]):
        mask[tip_indices[i]] = 1.0

    # apply the mask such that tip_ds only includes values which correspond to
    # the tips of the tree.
    for i in range(n_rows):
        tip_ds[i] *= mask[i]

    return tip_ds


@cython.boundscheck(False)
@cython.wraparound(False)
cdef _traverse_reduce(np.ndarray[DTYPE_t, ndim=2] child_index,
                      np.ndarray[DTYPE_t, ndim=2] a):
    """Apply a[k] = sum[i:j]

    Parameters
    ----------
    child_index: np.array of int
        A matrix in which the first column corresponds to an index position in
        ``a``, which represents a node in a tree. The second column is the
        starting index in ``a`` for the node's children, and the third column
        is the ending index in ``a`` for the node's children.
    a : np.ndarray of int
        A matrix of the environment data. Each row corresponds to a node in a
        tree, and each column corresponds to an environment. On input, it is
        assumed that only tips have counts.

    Notes
    -----
    This is effectively a postorder reduction over the tree. For example,
    given the following tree:

                            /-A
                  /E-------|
                 |          \-B
        -root----|
                 |          /-C
                  \F-------|
                            \-D

    And assuming counts for [A, B, C, D] in environment FOO of [1, 1, 1, 0] and
    counts for environment BAR of [0, 1, 1, 1], the input counts matrix ``a``
    would be:

        [1 0  -> A
         1 1  -> B
         1 1  -> C
         0 1  -> D
         0 0  -> E
         0 0  -> F
         0 0] -> root

    The method will perform the following reduction:

        [1 0     [1 0     [1 0     [1 0
         1 1      1 1      1 1      1 1
         1 1      1 1      1 1      1 1
         0 1  ->  0 1  ->  0 1  ->  0 1
         0 0      2 1      2 1      2 1
         0 0      0 0      1 2      1 2
         0 0]     0 0]     0 0]     3 3]

    The index positions of the above are encoded in ``child_index`` which
    describes the node to aggregate into, and the start and stop index
    positions of the nodes immediate descendents.

    This method operates inplace on ``a``
    """
    cdef:
        Py_ssize_t i, j, k
        DTYPE_t node, start, end
        DTYPE_t n_envs = a.shape[1]

    # possible GPGPU target
    for i in range(child_index.shape[0]):
        node = child_index[i, 0]
        start = child_index[i, 1]
        end = child_index[i, 2]

        for j in range(start, end + 1):
            for k in range(n_envs):
                a[node, k] += a[j, k]


@cython.boundscheck(False)
@cython.wraparound(False)
def _nodes_by_counts(np.ndarray counts,
                     np.ndarray tip_ids,
                     dict indexed):
    """Construct the count array, and the counts up the tree

    Parameters
    ----------
    counts : np.array of int
        A 1D or 2D vector in which each row corresponds to the observed counts
        in an environment. The rows are expected to be in order with respect to
        `tip_ids`.
    tip_ids : np.array of str
        A vector of tip names that correspond to the columns in the `counts`
        matrix.
    indexed : dict
        The result of `index_tree`.

    Returns
    -------
    np.array of int
        The observed counts of every node and the counts if its descendents.

    """
    cdef:
        np.ndarray nodes, observed_ids
        np.ndarray[DTYPE_t, ndim=2] count_array, counts_t
        np.ndarray[DTYPE_t, ndim=1] observed_indices, otus_in_nodes
        Py_ssize_t i, j
        set observed_ids_set
        object n
        dict node_lookup
        DTYPE_t n_count_vectors, n_count_otus

    nodes = indexed['name']

    # allow counts to be a vector
    counts = np.atleast_2d(counts)
    counts = counts.astype(DTYPE, copy=False)

    # determine observed IDs. It may be possible to unroll these calls to
    # squeeze a little more performance
    observed_indices = counts.sum(0).nonzero()[0]
    observed_ids = tip_ids[observed_indices]
    observed_ids_set = set(observed_ids)

    # construct mappings of the observed to their positions in the node array
    node_lookup = {}
    for i in range(nodes.shape[0]):
        n = nodes[i]
        if n in observed_ids_set:
            node_lookup[n] = i

    # determine the positions of the observed IDs in nodes
    otus_in_nodes = np.zeros(observed_ids.shape[0], dtype=DTYPE)
    for i in range(observed_ids.shape[0]):
        n = observed_ids[i]
        otus_in_nodes[i] = node_lookup[n]

    # count_array has a row per node (not tip) and a column per env.
    n_count_vectors = counts.shape[0]
    count_array = np.zeros((nodes.shape[0], n_count_vectors), dtype=DTYPE)

    # populate the counts array with the counts of each observation in each
    # env
    counts_t = counts.transpose()
    n_count_otus = otus_in_nodes.shape[0]
    for i in range(n_count_otus):
        for j in range(n_count_vectors):
            count_array[otus_in_nodes[i], j] = counts_t[observed_indices[i], j]

    child_index = indexed['child_index'].astype(DTYPE, copy=False)
    _traverse_reduce(child_index, count_array)

    return count_array