File: _util.py

package info (click to toggle)
python-skbio 0.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 16,556 kB
  • ctags: 7,222
  • sloc: python: 42,085; ansic: 670; makefile: 180; sh: 10
file content (135 lines) | stat: -rw-r--r-- 5,022 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import collections

import numpy as np

from skbio.tree import DuplicateNodeError, MissingNodeError
from skbio.diversity._phylogenetic import _nodes_by_counts


def _validate_counts_vector(counts, suppress_cast=False):
    """Validate and convert input to an acceptable counts vector type.

    Note: may not always return a copy of `counts`!

    """
    counts = np.asarray(counts)

    if not suppress_cast:
        counts = counts.astype(int, casting='safe', copy=False)

    if counts.ndim != 1:
        raise ValueError("Only 1-D vectors are supported.")
    elif (counts < 0).any():
        raise ValueError("Counts vector cannot contain negative values.")

    return counts


def _validate_counts_matrix(counts, ids=None, suppress_cast=False):
    results = []

    # handle case of where counts is a single vector by making it a matrix.
    # this has to be done before forcing counts into an ndarray because we
    # don't yet know that all of the entries are of equal length
    if len(counts) == 0 or not isinstance(counts[0], collections.Iterable):
        counts = [counts]
    counts = np.asarray(counts)
    if counts.ndim > 2:
        raise ValueError("Only 1-D and 2-D array-like objects can be provided "
                         "as input. Provided object has %d dimensions." %
                         counts.ndim)

    if ids is not None and len(counts) != len(ids):
        raise ValueError(
            "Number of rows in ``counts`` must be equal to number of provided "
            "``ids``.")

    lens = []
    for v in counts:
        results.append(_validate_counts_vector(v, suppress_cast))
        lens.append(len(v))
    if len(set(lens)) > 1:
        raise ValueError("All rows in ``counts`` must be of equal length.")

    return np.asarray(results)


def _validate_otu_ids_and_tree(counts, otu_ids, tree):
    len_otu_ids = len(otu_ids)
    set_otu_ids = set(otu_ids)
    if len_otu_ids != len(set_otu_ids):
        raise ValueError("``otu_ids`` cannot contain duplicated ids.")

    if len(counts) != len_otu_ids:
        raise ValueError("``otu_ids`` must be the same length as ``counts`` "
                         "vector(s).")

    if len(tree.root().children) == 0:
        raise ValueError("``tree`` must contain more than just a root node.")

    if len(tree.root().children) > 2:
        # this is an imperfect check for whether the tree is rooted or not.
        # can this be improved?
        raise ValueError("``tree`` must be rooted.")

    # all nodes (except the root node) have corresponding branch lengths
    # all tip names in tree are unique
    # all otu_ids correspond to tip names in tree
    branch_lengths = []
    tip_names = []
    for e in tree.traverse():
        if not e.is_root():
            branch_lengths.append(e.length)
        if e.is_tip():
            tip_names.append(e.name)
    set_tip_names = set(tip_names)
    if len(tip_names) != len(set_tip_names):
        raise DuplicateNodeError("All tip names must be unique.")
    if np.array([l is None for l in branch_lengths]).any():
        raise ValueError("All non-root nodes in ``tree`` must have a branch "
                         "length.")
    missing_tip_names = set_otu_ids - set_tip_names
    if missing_tip_names != set():
        n_missing_tip_names = len(missing_tip_names)
        raise MissingNodeError("All ``otu_ids`` must be present as tip names "
                               "in ``tree``. ``otu_ids`` not corresponding to "
                               "tip names (n=%d): %s" %
                               (n_missing_tip_names,
                                " ".join(missing_tip_names)))


def _vectorize_counts_and_tree(counts, otu_ids, tree):
    """ Index tree and convert counts to np.array in corresponding order
    """
    tree_index = tree.to_array(nan_length_value=0.0)
    otu_ids = np.asarray(otu_ids)
    counts = np.atleast_2d(counts)
    counts_by_node = _nodes_by_counts(counts, otu_ids, tree_index)
    branch_lengths = tree_index['length']

    # branch_lengths is just a reference to the array inside of tree_index,
    # but it's used so much that it's convenient to just pull it out here.
    return counts_by_node.T, tree_index, branch_lengths


def _get_phylogenetic_kwargs(counts, **kwargs):
    try:
        otu_ids = kwargs.pop('otu_ids')
    except KeyError:
        raise ValueError("``otu_ids`` is required for phylogenetic diversity "
                         "metrics.")
    try:
        tree = kwargs.pop('tree')
    except KeyError:
        raise ValueError("``tree`` is required for phylogenetic diversity "
                         "metrics.")

    return otu_ids, tree, kwargs