File: _util.py

package info (click to toggle)
python-skbio 0.6.2-4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 9,312 kB
  • sloc: python: 60,482; ansic: 672; makefile: 224
file content (284 lines) | stat: -rw-r--r-- 9,375 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------

import collections.abc

import numpy as np
import pandas as pd

from skbio.tree import DuplicateNodeError, MissingNodeError
from skbio.diversity._phylogenetic import _nodes_by_counts


def _validate_counts_vector(counts, cast_int=False):
    """Validate and convert input to an acceptable counts vector type.

    Parameters
    ----------
    counts : array_like of int or float of shape (n_taxa,)
        Vector of counts.
    cast_int : bool, optional
        Cast values into integers, if not already. ``False`` by default.

    Returns
    -------
    ndarray of int or float of shape (n_taxa,)
        Valid counts vector.

    Raises
    ------
    ValueError
        If input array has an invalid data type.
    ValueError
        If input array is not 1-D.
    ValueError
        If there are negative values.

    Notes
    -----
    This function will return the original ``counts`` if it is already a valid counts
    vector. Otherwise it will return an edited copy that is valid.

    The data type of counts must be any subtype of ``np.integer`` (integers) or
    ``np.floating`` (floating-point numbers; excluding complex numbers) [1]_.

    See Also
    --------
    _validate_counts_matrix

    References
    ----------
    .. [1] https://numpy.org/doc/stable/reference/arrays.scalars.html

    """
    counts = np.asarray(counts)

    # counts must be int or float
    if np.issubdtype(dtype := counts.dtype, np.floating):
        # cast values into integers
        if cast_int:
            counts = counts.astype(int)

    elif not np.issubdtype(dtype, np.integer) and dtype is not np.dtype("bool"):
        raise ValueError("Counts must be integers or floating-point numbers.")

    if counts.ndim != 1:
        raise ValueError("Only 1-D vectors are supported.")

    if (counts < 0).any():
        raise ValueError("Counts vector cannot contain negative values.")

    return counts


def _validate_counts_matrix(counts, ids=None, cast_int=False):
    """Validate and convert input to an acceptable counts matrix type.

    Parameters
    ----------
    counts : array_like of shape (n_samples, n_taxa)
        Matrix of counts.
    ids : array_like of shape (n_samples,), optional
        Sample IDs to check against counts dimensions.
    cast_int : bool, optional
        Cast values into integers, if not already. ``False`` by default.

    Returns
    -------
    ndarray of shape (n_samples, n_taxa)
        Valid counts matrix.

    See Also
    --------
    _validate_counts_vector

    """
    lenerr = "Number of rows in `counts` must be equal to number of provided `ids`."

    # handle pandas data frame
    if isinstance(counts, pd.DataFrame):
        if ids is not None and counts.shape[0] != len(ids):
            raise ValueError(lenerr)
        counts = counts.to_numpy()

    else:
        # convert counts into a 2-D array
        # will raise ValueError if row lengths are unequal
        counts = np.atleast_2d(counts)

        if counts.ndim > 2:
            raise ValueError(
                "Only 1-D and 2-D array-like objects can be provided as input. "
                f"Provided object has {counts.ndim} dimensions."
            )

        if ids is not None and counts.shape[0] != len(ids):
            raise ValueError(lenerr)

    # counts must be int or float
    if np.issubdtype(dtype := counts.dtype, np.floating):
        # cast values into integers
        if cast_int:
            counts = counts.astype(int)

    elif not np.issubdtype(dtype, np.integer) and dtype is not np.dtype("bool"):
        raise ValueError("Counts must be integers or floating-point numbers.")

    # negative values are not allowed
    # TODO: `counts < 0` creates a Boolean array of the same shape, which could be
    # memory-inefficient if the input array is very large. Should optimize.
    # See: https://stackoverflow.com/questions/75553212/
    if (counts < 0).any():
        raise ValueError("Counts cannot contain negative values.")

    return counts


def _validate_taxa_and_tree(counts, taxa, tree, rooted=True):
    """Validate taxa and tree prior to calculating phylogenetic diversity metrics."""
    len_taxa = len(taxa)
    set_taxa = set(taxa)
    if len_taxa != len(set_taxa):
        raise ValueError("``taxa`` cannot contain duplicated ids.")

    if len(counts) != len_taxa:
        raise ValueError("``taxa`` must be the same length as ``counts`` " "vector(s).")

    if len(tree.root().children) == 0:
        raise ValueError("``tree`` must contain more than just a root node.")

    if rooted is True and len(tree.root().children) > 2:
        # this is an imperfect check for whether the tree is rooted or not.
        # can this be improved?
        raise ValueError("``tree`` must be rooted.")

    # all nodes (except the root node) have corresponding branch lengths
    # all tip names in tree are unique
    # all taxa correspond to tip names in tree
    branch_lengths = []
    tip_names = []
    for e in tree.traverse():
        if not e.is_root():
            branch_lengths.append(e.length)
        if e.is_tip():
            tip_names.append(e.name)
    set_tip_names = set(tip_names)
    if len(tip_names) != len(set_tip_names):
        raise DuplicateNodeError("All tip names must be unique.")

    if np.array([branch is None for branch in branch_lengths]).any():
        raise ValueError("All non-root nodes in ``tree`` must have a branch " "length.")
    missing_tip_names = set_taxa - set_tip_names
    if missing_tip_names != set():
        n_missing_tip_names = len(missing_tip_names)
        raise MissingNodeError(
            "All ``taxa`` must be present as tip names "
            "in ``tree``. ``taxa`` not corresponding to "
            "tip names (n=%d): %s" % (n_missing_tip_names, " ".join(missing_tip_names))
        )


def _vectorize_counts_and_tree(counts, taxa, tree):
    """Index the tree and arrange counts as an array in the matching node order.

    Parameters
    ----------
    counts : array_like of shape (n_samples, n_taxa) or (n_taxa,)
        Counts/abundances of taxa in one or multiple samples. A 1-D input is
        promoted to a single-row 2-D array.
    taxa : array_like of shape (n_taxa,)
        Taxon IDs corresponding to tip names in `tree`.
    tree : skbio.TreeNode
        Tree relating taxa. The set of tip names in the tree can be a superset
        of `taxa`, but not a subset.

    Returns
    -------
    ndarray of shape (n_samples, n_nodes)
        Total counts/abundances of taxa descending from individual nodes of the tree.
    dict of array
        Indexed tree. See `to_array`.
    ndarray of shape (n_nodes,)
        Branch lengths of corresponding nodes of the tree.

    See Also
    --------
    skbio.tree.TreeNode.to_array

    """
    # missing branch lengths are replaced by 0.0 in the indexed tree
    indexed_tree = tree.to_array(nan_length_value=0.0)

    taxa_arr = np.asarray(taxa)
    counts_2d = np.atleast_2d(counts)
    per_node = _nodes_by_counts(counts_2d, taxa_arr, indexed_tree)

    # the branch lengths are not copied; the returned array is the very one held
    # by the indexed tree, exposed separately because callers use it so often
    return per_node.T, indexed_tree, indexed_tree["length"]


def _get_phylogenetic_kwargs(counts, **kwargs):
    try:
        taxa = kwargs.pop("taxa")
    except KeyError:
        raise ValueError("``taxa`` is required for phylogenetic diversity " "metrics.")
    try:
        tree = kwargs.pop("tree")
    except KeyError:
        raise ValueError("``tree`` is required for phylogenetic diversity " "metrics.")

    return taxa, tree, kwargs


def _quantitative_to_qualitative_counts(counts):
    return counts > 0.0


def _check_taxa_alias(taxa, tree, otu_ids):
    # make `taxa` an alias of `taxa`; for backward compatibility
    if taxa is None:
        if otu_ids is None:
            raise ValueError("A list of taxon IDs must be provided.")
        taxa = otu_ids
    if tree is None:
        raise ValueError("A phylogenetic tree must be provided.")
    return taxa


def _table_to_numpy(table):
    """Convert a skbio.table.Table to a dense representation.

    This is a stop-gap solution to allow current Table objects to interoperate
    with existing driver methods, until they transition to be "sparse" aware.
    """
    sample_ids = list(table.ids())
    obs_ids = list(table.ids(axis="observation"))

    if table.is_empty():
        counts = np.array([[]] * len(sample_ids))
    else:
        counts = table.matrix_data.T.toarray()

    return counts, sample_ids, obs_ids


def _validate_table(counts, ids, kwargs):
    """Disallow overriding of sample and feature IDs.

    WARNING: this implicitly adds an entry to kwargs IF `tree` is present.
    """
    if ids is not None:
        raise ValueError("Cannot provide a `Table` as `counts` and `ids`")

    if "taxa" in kwargs:
        raise ValueError("Cannot provide a `Table` as `counts` and `taxa`")

    dense_counts, sample_ids, feature_ids = _table_to_numpy(counts)
    if "tree" in kwargs:
        kwargs["taxa"] = feature_ids

    return dense_counts, sample_ids