# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------
import collections.abc
import numpy as np
import pandas as pd
from skbio.tree import DuplicateNodeError, MissingNodeError
from skbio.diversity._phylogenetic import _nodes_by_counts
def _validate_counts_vector(counts, cast_int=False):
    """Validate and convert input to an acceptable counts vector type.

    Parameters
    ----------
    counts : array_like of int or float of shape (n_taxa,)
        Vector of counts.
    cast_int : bool, optional
        Cast values into integers, if not already. ``False`` by default.

    Returns
    -------
    ndarray of int or float of shape (n_taxa,)
        Valid counts vector.

    Raises
    ------
    ValueError
        If input array has an invalid data type.
    ValueError
        If input array is not 1-D.
    ValueError
        If there are negative values.

    Notes
    -----
    This function will return the original ``counts`` if it is already a valid counts
    vector. Otherwise it will return an edited copy that is valid.

    The data type of counts must be any subtype of ``np.integer`` (integers) or
    ``np.floating`` (floating-point numbers; excluding complex numbers) [1]_.
    Boolean arrays are also accepted; they are returned as is and are not affected
    by ``cast_int``.

    Negative values are checked on the values as provided, i.e., before any
    integer cast. Casting truncates toward zero, so checking only afterwards would
    let values such as ``-0.5`` pass as ``0``.

    See Also
    --------
    _validate_counts_matrix

    References
    ----------
    .. [1] https://numpy.org/doc/stable/reference/arrays.scalars.html

    """
    counts = np.asarray(counts)
    dtype = counts.dtype

    # counts must be int or float (bool is tolerated as well)
    is_float = np.issubdtype(dtype, np.floating)
    if not (is_float or np.issubdtype(dtype, np.integer) or dtype == np.dtype(bool)):
        raise ValueError("Counts must be integers or floating-point numbers.")

    if counts.ndim != 1:
        raise ValueError("Only 1-D vectors are supported.")

    negerr = "Counts vector cannot contain negative values."

    # check the sign on the original values, before truncation can hide it
    if (counts < 0).any():
        raise ValueError(negerr)

    # cast values into integers
    if is_float and cast_int:
        counts = counts.astype(int)
        # the post-cast check is kept: values without a meaningful integer
        # representation (e.g., NaN) may come out negative after the cast
        if (counts < 0).any():
            raise ValueError(negerr)

    return counts
def _validate_counts_matrix(counts, ids=None, cast_int=False):
    """Validate and convert input to an acceptable counts matrix type.

    Parameters
    ----------
    counts : array_like of shape (n_samples, n_taxa)
        Matrix of counts. A 1-D input is promoted to a single-row matrix.
    ids : array_like of shape (n_samples,), optional
        Sample IDs to check against counts dimensions.
    cast_int : bool, optional
        Cast values into integers, if not already. ``False`` by default.

    Returns
    -------
    ndarray of shape (n_samples, n_taxa)
        Valid counts matrix.

    Raises
    ------
    ValueError
        If input has more than 2 dimensions.
    ValueError
        If number of rows does not match number of provided ``ids``.
    ValueError
        If input array has an invalid data type.
    ValueError
        If there are negative values.

    Notes
    -----
    Negative values are checked on the values as provided, i.e., before any
    integer cast. Casting truncates toward zero, so checking only afterwards would
    let values such as ``-0.5`` pass as ``0``.

    See Also
    --------
    _validate_counts_vector

    """
    # handle pandas data frame
    if isinstance(counts, pd.DataFrame):
        counts = counts.to_numpy()
    else:
        # convert counts into a 2-D array
        # will raise ValueError if row lengths are unequal
        counts = np.atleast_2d(counts)
        if counts.ndim > 2:
            raise ValueError(
                "Only 1-D and 2-D array-like objects can be provided as input. "
                f"Provided object has {counts.ndim} dimensions."
            )

    if ids is not None and counts.shape[0] != len(ids):
        raise ValueError(
            "Number of rows in `counts` must be equal to number of provided `ids`."
        )

    # counts must be int or float (bool is tolerated as well)
    dtype = counts.dtype
    is_float = np.issubdtype(dtype, np.floating)
    if not (is_float or np.issubdtype(dtype, np.integer) or dtype == np.dtype(bool)):
        raise ValueError("Counts must be integers or floating-point numbers.")

    negerr = "Counts cannot contain negative values."

    # negative values are not allowed; check the sign on the original values,
    # before truncation can hide it
    # TODO: `counts < 0` creates a Boolean array of the same shape, which could be
    # memory-inefficient if the input array is very large. Should optimize.
    # See: https://stackoverflow.com/questions/75553212/
    if (counts < 0).any():
        raise ValueError(negerr)

    # cast values into integers
    if is_float and cast_int:
        counts = counts.astype(int)
        # the post-cast check is kept: values without a meaningful integer
        # representation (e.g., NaN) may come out negative after the cast
        if (counts < 0).any():
            raise ValueError(negerr)

    return counts
def _validate_taxa_and_tree(counts, taxa, tree, rooted=True):
    """Validate taxa and tree prior to calculating phylogenetic diversity metrics.

    Parameters
    ----------
    counts : sized
        Counts vector. Only its length is inspected here.
    taxa : sized iterable of hashable
        Taxon IDs; must be unique and as many as there are ``counts``.
    tree : tree node
        Tree relating taxa. Must expose ``root()`` and ``traverse()``, and its
        nodes must expose ``children``, ``length``, ``name``, ``is_root()`` and
        ``is_tip()``.
    rooted : bool, optional
        If ``True`` (default), reject trees whose root has more than two children.

    Raises
    ------
    ValueError
        If ``taxa`` contains duplicates or its length differs from ``counts``.
    ValueError
        If ``tree`` consists of a root node only.
    ValueError
        If ``rooted`` is ``True`` and the root has more than two children.
    DuplicateNodeError
        If tip names in ``tree`` are not unique.
    ValueError
        If any non-root node lacks a branch length.
    MissingNodeError
        If any of ``taxa`` is not a tip name in ``tree``.

    """
    n_taxa = len(taxa)
    taxa_set = set(taxa)
    if n_taxa != len(taxa_set):
        raise ValueError("``taxa`` cannot contain duplicated ids.")

    if len(counts) != n_taxa:
        raise ValueError("``taxa`` must be the same length as ``counts`` vector(s).")

    n_root_children = len(tree.root().children)
    if n_root_children == 0:
        raise ValueError("``tree`` must contain more than just a root node.")

    if rooted is True and n_root_children > 2:
        # this is an imperfect check for whether the tree is rooted or not.
        # can this be improved?
        raise ValueError("``tree`` must be rooted.")

    # a single traversal collects everything needed for the remaining checks:
    # - all nodes (except the root node) have corresponding branch lengths
    # - all tip names in tree are unique
    # - all taxa correspond to tip names in tree
    has_missing_length = False
    tip_names = []
    for node in tree.traverse():
        if not node.is_root() and node.length is None:
            has_missing_length = True
        if node.is_tip():
            tip_names.append(node.name)

    tip_name_set = set(tip_names)
    if len(tip_names) != len(tip_name_set):
        raise DuplicateNodeError("All tip names must be unique.")

    if has_missing_length:
        raise ValueError("All non-root nodes in ``tree`` must have a branch length.")

    missing = taxa_set - tip_name_set
    if missing:
        # stringify before joining so that non-string taxon IDs are reported
        # instead of triggering a TypeError; sort for a reproducible message
        listed = " ".join(sorted(map(str, missing)))
        raise MissingNodeError(
            "All ``taxa`` must be present as tip names in ``tree``. "
            "``taxa`` not corresponding to tip names "
            f"(n={len(missing)}): {listed}"
        )
def _vectorize_counts_and_tree(counts, taxa, tree):
    """Index tree and convert counts to np.array in corresponding order.

    Parameters
    ----------
    counts : array_like of shape (n_samples, n_taxa) or (n_taxa,)
        Counts/abundances of taxa in one or multiple samples.
    taxa : array_like of shape (n_taxa,)
        Taxon IDs corresponding to tip names in `tree`.
    tree : skbio.TreeNode
        Tree relating taxa. The set of tip names in the tree can be a superset
        of `taxa`, but not a subset.

    Returns
    -------
    ndarray of shape (n_samples, n_nodes)
        Total counts/abundances of taxa descending from individual nodes of the tree.
    dict of array
        Indexed tree. See `to_array`.
    ndarray of shape (n_nodes,)
        Branch lengths of corresponding nodes of the tree.

    See Also
    --------
    skbio.tree.TreeNode.to_array

    """
    # index the tree; missing branch lengths are filled with 0.0
    indexed_tree = tree.to_array(nan_length_value=0.0)

    # a single counts vector is promoted to a one-row matrix
    per_node = _nodes_by_counts(
        np.atleast_2d(counts), np.asarray(taxa), indexed_tree
    )

    # the lengths array is the very object stored inside the indexed tree (no
    # copy); it is returned separately only for the caller's convenience
    return per_node.T, indexed_tree, indexed_tree["length"]
def _get_phylogenetic_kwargs(counts, **kwargs):
    """Extract the ``taxa`` and ``tree`` arguments required by phylogenetic metrics.

    Parameters
    ----------
    counts : any
        Not used by this function; accepted for signature compatibility.
    **kwargs
        Keyword arguments that must include ``taxa`` and ``tree``.

    Returns
    -------
    taxa
        Value passed as ``taxa``.
    tree
        Value passed as ``tree``.
    dict
        Remaining keyword arguments, with ``taxa`` and ``tree`` removed.

    Raises
    ------
    ValueError
        If ``taxa`` or ``tree`` is missing. ``taxa`` is checked first.

    """
    # test membership explicitly instead of translating a KeyError, so that the
    # error reaches the user without an unrelated chained KeyError traceback
    if "taxa" not in kwargs:
        raise ValueError("``taxa`` is required for phylogenetic diversity metrics.")
    taxa = kwargs.pop("taxa")

    if "tree" not in kwargs:
        raise ValueError("``tree`` is required for phylogenetic diversity metrics.")
    tree = kwargs.pop("tree")

    return taxa, tree, kwargs
def _quantitative_to_qualitative_counts(counts):
    """Convert counts into presence/absence data.

    Returns the result of the element-wise comparison ``counts > 0.0``.
    """
    is_present = counts > 0.0
    return is_present
def _check_taxa_alias(taxa, tree, otu_ids):
    """Resolve taxon IDs from ``taxa`` or its alias ``otu_ids`` and require a tree.

    ``otu_ids`` is an alias of ``taxa`` kept for backward compatibility. It is
    only consulted when ``taxa`` is ``None``.

    Returns
    -------
    The value of ``taxa`` if it is not ``None``, otherwise the value of ``otu_ids``.

    Raises
    ------
    ValueError
        If both ``taxa`` and ``otu_ids`` are ``None``.
    ValueError
        If ``tree`` is ``None``.

    """
    resolved = otu_ids if taxa is None else taxa
    if resolved is None:
        raise ValueError("A list of taxon IDs must be provided.")
    if tree is None:
        raise ValueError("A phylogenetic tree must be provided.")
    return resolved
def _table_to_numpy(table):
    """Convert a skbio.table.Table to a dense representation.

    This is a stop-gap solution to allow current Table objects to interoperate
    with existing driver methods, until they transition to be "sparse" aware.

    Returns
    -------
    counts
        Dense array obtained from the transpose of ``table.matrix_data``, or an
        array built from one empty row per sample ID if the table is empty.
    list
        Sample IDs (``table.ids()``).
    list
        Observation IDs (``table.ids(axis="observation")``).

    """
    samples = list(table.ids())
    observations = list(table.ids(axis="observation"))

    if not table.is_empty():
        return table.matrix_data.T.toarray(), samples, observations

    # an empty table has no data to densify; emit one empty row per sample
    placeholder = np.array([[] for _ in samples])
    return placeholder, samples, observations
def _validate_table(counts, ids, kwargs):
    """Disallow overriding of sample and feature IDs.

    WARNING: this implicitly adds an entry to kwargs IF `tree` is present.

    Parameters
    ----------
    counts : table
        Table to convert with ``_table_to_numpy``.
    ids : None
        Must be ``None``; sample IDs are taken from the table.
    kwargs : dict
        Keyword arguments of the caller. Must not contain ``taxa``. If it contains
        ``tree``, ``kwargs["taxa"]`` is set in place to the table's feature IDs.

    Returns
    -------
    tuple
        Dense counts and sample IDs.

    Raises
    ------
    ValueError
        If ``ids`` is not ``None`` or ``taxa`` is in ``kwargs``.

    """
    if ids is not None:
        raise ValueError("Cannot provide a `Table` as `counts` and `ids`")
    if "taxa" in kwargs:
        raise ValueError("Cannot provide a `Table` as `counts` and `taxa`")

    # converted == (dense counts, sample IDs, feature IDs)
    converted = _table_to_numpy(counts)

    # the caller's dict is mutated on purpose so that the table's feature IDs
    # travel along with the tree
    if "tree" in kwargs:
        kwargs["taxa"] = converted[2]

    return converted[:2]
|