# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd
from scipy.linalg import svd, lstsq

from ._ordination_results import OrdinationResults
from ._utils import corr, svd_rank, scale
from skbio.util._decorator import experimental


@experimental(as_of="0.4.0")
def cca(y, x, scaling=1):
r"""Compute canonical (also known as constrained) correspondence
analysis.
Canonical (or constrained) correspondence analysis is a
multivariate ordination technique. It appeared in community
ecology [1]_ and relates community composition to the variation in
the environment (or in other factors). It works from data on
abundances or counts of samples and constraints variables,
and outputs ordination axes that maximize sample separation among species.
It is better suited to extract the niches of taxa than linear
multivariate methods because it assumes unimodal response curves
(habitat preferences are often unimodal functions of habitat
variables [2]_).
As more environmental variables are added, the result gets more
similar to unconstrained ordination, so only the variables that
are deemed explanatory should be included in the analysis.
    Parameters
    ----------
    y : DataFrame
        Samples by features table (n, m)
    x : DataFrame
        Samples by constraints table (n, q)
    scaling : int, {1, 2}, optional
        Scaling type 1 maintains :math:`\chi^2` distances between rows.
        Scaling type 2 preserves :math:`\chi^2` distances between
        columns. For a more detailed explanation of the interpretation,
        check Legendre & Legendre 1998, section 9.4.3.

    Returns
    -------
    OrdinationResults
        Object that stores the cca results.

    Raises
    ------
    ValueError
        If `x` and `y` have different numbers of rows.
        If `y` contains negative values.
        If `y` contains a row of only 0's.
    NotImplementedError
        If scaling is not 1 or 2.

    See Also
    --------
    ca
    rda
    OrdinationResults

    Notes
    -----
    The algorithm is based on [3]_, \S 11.2, and is expected to give
    the same results as ``cca(y, x)`` in R's package vegan, except
    that this implementation won't drop constraining variables due to
    perfect collinearity: the user needs to choose which ones to
    input.

    Canonical *correspondence* analysis shouldn't be confused with
    canonical *correlation* analysis (CCorA, but sometimes called
    CCA), a different technique to search for multivariate
    relationships between two datasets. Canonical correlation analysis
    is a statistical tool that, given two vectors of random variables,
    finds linear combinations that have maximum correlation with each
    other. In some sense, it assumes linear responses of "species" to
    "environmental variables" and is not well suited to analyze
    ecological data.

    References
    ----------
    .. [1] Cajo J. F. Ter Braak, "Canonical Correspondence Analysis: A
       New Eigenvector Technique for Multivariate Direct Gradient
       Analysis", Ecology 67.5 (1986), pp. 1167-1179.
    .. [2] Cajo J. F. Ter Braak and Piet F. M. Verdonschot, "Canonical
       correspondence analysis and related multivariate methods in
       aquatic ecology", Aquatic Sciences 57.3 (1995), pp. 255-289.
    .. [3] Legendre P. and Legendre L. 1998. Numerical
       Ecology. Elsevier, Amsterdam.

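    Examples
    --------
    A minimal usage sketch on made-up toy data (the sample, feature and
    constraint names below are purely illustrative):

    >>> import pandas as pd
    >>> from skbio.stats.ordination import cca
    >>> Y = pd.DataFrame([[1, 0, 3], [2, 1, 0], [0, 4, 2], [1, 2, 1]],
    ...                  index=['s1', 's2', 's3', 's4'],
    ...                  columns=['sp1', 'sp2', 'sp3'])
    >>> X = pd.DataFrame([[1.0, 0.2], [2.0, 0.5], [3.0, 0.1], [4.0, 0.8]],
    ...                  index=['s1', 's2', 's3', 's4'],
    ...                  columns=['pH', 'depth'])
    >>> ordination = cca(Y, X, scaling=1)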
"""
    Y = y.values
    X = x.values
# Perform parameter sanity checks
    if X.shape[0] != Y.shape[0]:
        raise ValueError("The samples by features table 'y' and the samples"
                         " by constraints table 'x' must have the same"
                         " number of rows. 'y': {0} 'x': {1}".format(
                             Y.shape[0], X.shape[0]))
if Y.min() < 0:
raise ValueError(
"The samples by features table 'y' must be nonnegative")
row_max = Y.max(axis=1)
if np.any(row_max <= 0):
# Or else the lstsq call to compute Y_hat breaks
raise ValueError("The samples by features table 'y' cannot contain a "
"row with only 0's")
if scaling not in {1, 2}:
raise NotImplementedError(
"Scaling {0} not implemented.".format(scaling))
# Step 1 (similar to Pearson chi-square statistic)
grand_total = Y.sum()
Q = Y / grand_total # Relative frequencies of Y (contingency table)
# Features and sample weights (marginal totals)
column_marginals = Q.sum(axis=0)
row_marginals = Q.sum(axis=1)
    # Formula 9.32 in Legendre & Legendre (1998). Notice that it's a
    # scaled version of the contribution of each cell towards the
    # Pearson chi-square statistic.
expected = np.outer(row_marginals, column_marginals)
Q_bar = (Q - expected) / np.sqrt(expected)
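    # That is, q_bar[i, j] = (q[i, j] - r[i] * c[j]) / sqrt(r[i] * c[j]),
    # the standardized residual of each cell under row/column independence.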
# Step 2. Standardize columns of X with respect to sample weights,
# using the maximum likelihood variance estimator (Legendre &
# Legendre 1998, p. 595)
X = scale(X, weights=row_marginals, ddof=0)
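    # After this step, each column of X should have weighted mean 0 and
    # weighted variance 1, with the weights given by the row marginals.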
# Step 3. Weighted multiple regression.
X_weighted = row_marginals[:, None]**0.5 * X
B, _, rank_lstsq, _ = lstsq(X_weighted, Q_bar)
Y_hat = X_weighted.dot(B)
Y_res = Q_bar - Y_hat
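    # B holds the least-squares coefficients of Q_bar on the weighted
    # constraints; Y_hat is the fitted (constrained) part of Q_bar and
    # Y_res the residual (unconstrained) part.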
# Step 4. Eigenvalue decomposition
u, s, vt = svd(Y_hat, full_matrices=False)
rank = svd_rank(Y_hat.shape, s)
s = s[:rank]
u = u[:, :rank]
vt = vt[:rank]
U = vt.T
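    # The squared singular values are the canonical eigenvalues, and the
    # columns of U are the constrained eigenvectors in feature space.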
# Step 5. Eq. 9.38
U_hat = Q_bar.dot(U) * s**-1
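    # Sample scores in feature space: Q_bar projected onto the feature
    # eigenvectors and rescaled by the inverse singular values.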
# Residuals analysis
u_res, s_res, vt_res = svd(Y_res, full_matrices=False)
rank = svd_rank(Y_res.shape, s_res)
s_res = s_res[:rank]
u_res = u_res[:, :rank]
vt_res = vt_res[:rank]
U_res = vt_res.T
U_hat_res = Y_res.dot(U_res) * s_res**-1
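    # The residual axes complement the canonical ones: they capture the
    # variation in Q_bar that the constraints fail to explain.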
eigenvalues = np.r_[s, s_res]**2
# Scalings (p. 596 L&L 1998):
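    # Scaling 1 preserves chi-square distances among rows (samples);
    # scaling 2 preserves chi-square distances among columns (features).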
# feature scores, scaling 1
V = (column_marginals**-0.5)[:, None] * U
# sample scores, scaling 2
V_hat = (row_marginals**-0.5)[:, None] * U_hat
# sample scores, scaling 1
F = V_hat * s
# feature scores, scaling 2
F_hat = V * s
# Sample scores which are linear combinations of constraint
# variables
Z_scaling1 = ((row_marginals**-0.5)[:, None] *
Y_hat.dot(U))
Z_scaling2 = Z_scaling1 * s**-1
# Feature residual scores, scaling 1
V_res = (column_marginals**-0.5)[:, None] * U_res
# Sample residual scores, scaling 2
V_hat_res = (row_marginals**-0.5)[:, None] * U_hat_res
# Sample residual scores, scaling 1
F_res = V_hat_res * s_res
# Feature residual scores, scaling 2
F_hat_res = V_res * s_res
if scaling == 1:
features_scores = np.hstack((V, V_res))
sample_scores = np.hstack((F, F_res))
sample_constraints = np.hstack((Z_scaling1, F_res))
elif scaling == 2:
features_scores = np.hstack((F_hat, F_hat_res))
sample_scores = np.hstack((V_hat, V_hat_res))
sample_constraints = np.hstack((Z_scaling2, V_hat_res))
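    # Correlations between the weighted constraint variables and the
    # canonical axes, used to draw the constraint arrows in biplots.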
biplot_scores = corr(X_weighted, u)
pc_ids = ['CCA%d' % (i+1) for i in range(len(eigenvalues))]
sample_ids = y.index
feature_ids = y.columns
eigvals = pd.Series(eigenvalues, index=pc_ids)
samples = pd.DataFrame(sample_scores,
columns=pc_ids, index=sample_ids)
features = pd.DataFrame(features_scores,
columns=pc_ids, index=feature_ids)
biplot_scores = pd.DataFrame(biplot_scores,
index=x.columns,
columns=pc_ids[:biplot_scores.shape[1]])
sample_constraints = pd.DataFrame(sample_constraints,
index=sample_ids, columns=pc_ids)
return OrdinationResults(
"CCA", "Canonical Correspondence Analysis", eigvals, samples,
features=features, biplot_scores=biplot_scores,
sample_constraints=sample_constraints,
proportion_explained=eigvals / eigvals.sum())