1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317
|
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------
from functools import partial
import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from scipy.spatial.distance import cdist
from ._cutils import geomedian_axis_one
from ._base import (
_preprocess_input_sng,
_run_monte_carlo_stats,
_build_results,
DistanceMatrix,
)
from skbio.stats.ordination import pcoa, OrdinationResults
def permdisp(
distance_matrix,
grouping,
column=None,
test="median",
permutations=999,
method="eigh",
number_of_dimensions=10,
):
"""Test for Homogeneity of Multivariate Groups Disperisons.
PERMDISP is a multivariate analogue of Levene's test for homogeneity of
multivariate variances. Distances are handled by reducing the
original distances to principal coordinates. PERMDISP calculates an
F-statistic to assess whether the dispersions between groups is significant
Parameters
----------
distance_matrix : DistanceMatrix or OrdinationResults
Distance matrix containing distances between objects (e.g., distances
between samples of microbial communities) or
result of pcoa on such a matrix.
grouping : 1-D array_like or pandas.DataFrame
Vector indicating the assignment of objects to groups. For example,
these could be strings or integers denoting which group an object
belongs to. If `grouping` is 1-D ``array_like``, it must be the same
length and in the same order as the objects in `distance_matrix`. If
`grouping` is a ``DataFrame``, the column specified by `column` will be
used as the grouping vector. The ``DataFrame`` must be indexed by the
IDs in `distance_matrix` (i.e., the row labels must be distance matrix
IDs), but the order of IDs between `distance_matrix` and the
``DataFrame`` need not be the same. All IDs in the distance matrix must
be present in the ``DataFrame``. Extra IDs in the ``DataFrame`` are
allowed (they are ignored in the calculations).
column : str, optional
Column name to use as the grouping vector if `grouping` is a
``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
Cannot be provided if `grouping` is 1-D ``array_like``.
test : {'centroid', 'median'}
determines whether the analysis is done using centroid or spaitial
median.
permutations : int, optional
Number of permutations to use when assessing statistical
significance. Must be greater than or equal to zero. If zero,
statistical significance calculations will be skipped and the p-value
will be ``np.nan``.
method : str, optional
Eigendecomposition method to use in performing PCoA.
By default, uses SciPy's `eigh`, which computes exact
eigenvectors and eigenvalues for all dimensions. The alternate
method, `fsvd`, uses faster heuristic eigendecomposition but loses
accuracy. The magnitude of accuracy lost is dependent on dataset.
Note that using `fsvd` is still considered experimental and
should be used with care.
Not used if distance_matrix is a OrdinationResults object.
number_of_dimensions : int, optional
Dimensions to reduce the distance matrix to if using the `fsvd` method.
Not used if the `eigh` method is being selected.
Returns
-------
pandas.Series
Results of the statistical test, including ``test statistic`` and
``p-value``.
Raises
------
TypeError
If, when using the spatial median test, the pcoa ordination is not of
type np.float32 or np.float64, the spatial median function will fail
and the centroid test should be used instead
ValueError
If the test is not centroid or median,
or if method is not eigh or fsvd
TypeError
If the distance matrix is not an instance of a
``skbio.DistanceMatrix``.
ValueError
If there is only one group
ValueError
If a list and a column name are both provided
ValueError
If a list is provided for `grouping` and it's length does not match
the number of ids in distance_matrix
ValueError
If all of the values in the grouping vector are unique
KeyError
If there are ids in grouping that are not in distance_matrix
See Also
--------
permanova
anosim
Notes
-----
This function uses Marti Anderson's PERMDISP2 procedure.
The significance of the results from this function will be the same as the
results found in vegan's betadisper, however due to floating point
variability the F-statistic results may vary slightly.
See [1]_ for the original method reference, as well as
``vegan::betadisper``, available in R's vegan package [2]_.
References
----------
.. [1] Anderson, M. J. (2006). Distance-based tests for homogeneity of multivariate
dispersions. Biometrics, 62(1), 245-253.
.. [2] http://cran.r-project.org/web/packages/vegan/index.html
Examples
--------
Load a 6x6 distance matrix and grouping vector denoting 2 groups of
objects:
>>> from skbio import DistanceMatrix
>>> dm = DistanceMatrix([[0, 0.5, 0.75, 1, 0.66, 0.33],
... [0.5, 0, 0.25, 0.33, 0.77, 0.61],
... [0.75, 0.25, 0, 0.1, 0.44, 0.55],
... [1, 0.33, 0.1, 0, 0.75, 0.88],
... [0.66, 0.77, 0.44, 0.75, 0, 0.77],
... [0.33, 0.61, 0.55, 0.88, 0.77, 0]],
... ['s1', 's2', 's3', 's4', 's5', 's6'])
>>> grouping = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
Run PERMDISP using 99 permutations to caluculate the p-value:
>>> from skbio.stats.distance import permdisp
>>> import numpy as np
>>> #make output deterministic, should not be included during normal use
>>> np.random.seed(0)
>>> permdisp(dm, grouping, permutations=99)
method name PERMDISP
test statistic name F-value
sample size 6
number of groups 2
test statistic ... 1.03...
p-value ...
number of permutations 99
Name: PERMDISP results, dtype: object
The return value is a ``pandas.Series`` object containing the results of
the statistical test.
To suppress calculation of the p-value and only obtain the F statistic,
specify zero permutations:
>>> permdisp(dm, grouping, permutations=0)
method name PERMDISP
test statistic name F-value
sample size 6
number of groups 2
test statistic ... 1.03...
p-value NaN
number of permutations 0
Name: PERMDISP results, dtype: object
PERMDISP computes variances based on two types of tests, using either
centroids or spatial medians, also commonly referred to as a geometric
median. The spatial median is thought to yield a more robust test
statistic, and this test is used by default. Spatial medians are computed
using an iterative algorithm to find the optimally minimum point from all
other points in a group while centroids are computed using a deterministic
formula. As such the two different tests yeild slightly different F
statistics.
>>> np.random.seed(0)
>>> permdisp(dm, grouping, test='centroid', permutations=6)
method name PERMDISP
test statistic name F-value
sample size 6
number of groups 2
test statistic ... 3.67...
p-value ... 0.42...
number of permutations 6
Name: PERMDISP results, dtype: object
You can also provide a ``pandas.DataFrame`` and a column denoting the
grouping instead of a grouping vector. The following DataFrame's
Grouping column specifies the same grouping as the vector we used in the
previous examples.:
>>> import pandas as pd
>>> df = pd.DataFrame.from_dict(
... {'Grouping': {'s1': 'G1', 's2': 'G1', 's3': 'G1', 's4': 'G2',
... 's5': 'G2', 's6': 'G2'}})
>>> permdisp(dm, df, 'Grouping', permutations=6, test='centroid')
method name PERMDISP
test statistic name F-value
sample size 6
number of groups 2
test statistic ... 3.67...
p-value ... 0.42...
number of permutations 6
Name: PERMDISP results, dtype: object
Note that when providing a ``DataFrame``, the ordering of rows and/or
columns does not affect the grouping vector that is extracted. The
``DataFrame`` must be indexed by the distance matrix IDs (i.e., the row
labels must be distance matrix IDs).
If IDs (rows) are present in the ``DataFrame`` but not in the distance
matrix, they are ignored. The previous example's ``s7`` ID illustrates this
behavior: note that even though the ``DataFrame`` had 7 objects, only 6
were used in the test (see the "Sample size" row in the results above to
confirm this). Thus, the ``DataFrame`` can be a superset of the distance
matrix IDs. Note that the reverse is not true: IDs in the distance matrix
*must* be present in the ``DataFrame`` or an error will be raised.
PERMDISP should be used to determine whether the dispersions between the
groups in your distance matrix are significantly separated.
A non-significant test result indicates that group dispersions are similar
to each other. PERMANOVA or ANOSIM should then be used in conjunction to
determine whether clustering within groups is significant.
"""
if test not in ["centroid", "median"]:
raise ValueError("Test must be centroid or median")
if isinstance(distance_matrix, OrdinationResults):
ordination = distance_matrix
ids = ordination.samples.axes[0].to_list()
sample_size = len(ids)
distance_matrix = None # not used anymore, avoid using by mistake
elif isinstance(distance_matrix, DistanceMatrix):
if method == "eigh":
# eigh does not natively support specifying number_of_dimensions
# and pcoa expects it to be 0
number_of_dimensions = 0
elif method != "fsvd":
raise ValueError("Method must be eigh or fsvd")
ids = distance_matrix.ids
sample_size = distance_matrix.shape[0]
ordination = pcoa(
distance_matrix, method=method, number_of_dimensions=number_of_dimensions
)
else:
raise TypeError("Input must be a DistanceMatrix or OrdinationResults.")
samples = ordination.samples
num_groups, grouping = _preprocess_input_sng(ids, sample_size, grouping, column)
test_stat_function = partial(_compute_groups, samples, test)
stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping, permutations)
return _build_results(
"PERMDISP", "F-value", sample_size, num_groups, stat, p_value, permutations
)
def _compute_groups(samples, test_type, grouping):
groups = []
samples["grouping"] = grouping
if test_type == "centroid":
centroids = samples.groupby("grouping").aggregate("mean")
elif test_type == "median":
grouping_cols = samples.columns.to_list()
centroids = samples.groupby("grouping")[grouping_cols].apply(_config_med)
for label, df in samples.groupby("grouping"):
groups.append(
cdist(
df.values[:, :-1].astype("float64"),
[centroids.loc[label].values],
metric="euclidean",
)
)
stat, _ = f_oneway(*groups)
stat = stat[0]
return stat
def _config_med(x):
"""Slice and transpose the vector.
Slice the vector up to the last value to exclude grouping column
and transpose the vector to be compatible with hd.geomedian.
"""
X = x.values[:, :-1]
return pd.Series(np.array(geomedian_axis_one(X.T)), index=x.columns[:-1])
|