# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd
from scipy.linalg import svd, lstsq
from skbio.util._decorator import experimental
from ._ordination_results import OrdinationResults
from ._utils import corr, svd_rank, scale


@experimental(as_of="0.4.0")
def rda(y, x, scale_Y=False, scaling=1):
r"""Compute redundancy analysis, a type of canonical analysis.

It is related to PCA and multiple regression because the explained
variables `y` are fitted to the explanatory variables `x` and PCA
is then performed on the fitted values. A similar process is
performed on the residuals.

RDA should be chosen if the studied gradient is small, and CCA
when it's large, so that the contingency table is sparse.

Parameters
----------
y : pd.DataFrame
:math:`n \times p` response matrix, where :math:`n` is the number
of samples and :math:`p` is the number of features. Its columns
need to be dimensionally homogeneous (or you can set `scale_Y=True`).
This matrix is also referred to as the community matrix, as it
commonly stores information about species abundances.
x : pd.DataFrame
:math:`n \times m, n \geq m` matrix of explanatory
variables, where :math:`n` is the number of samples and
:math:`m` is the number of metadata variables. Its columns
need not be standardized, but doing so turns regression
coefficients into standard regression coefficients.
scale_Y : bool, optional
Controls whether the response matrix columns are scaled to
have unit standard deviation. Defaults to `False`.
scaling : int
Scaling type 1 produces a distance biplot. It focuses on
the ordination of rows (samples) because their transformed
distances approximate their original euclidean
distances. Especially interesting when most explanatory
variables are binary.
Scaling type 2 produces a correlation biplot. It focuses
on the relationships among explained variables (`y`). It
is interpreted like scaling type 1, but taking into
account that distances between objects don't approximate
their euclidean distances.
See more details about distance and correlation biplots in
[1]_, \S 9.1.4.

Returns
-------
OrdinationResults
Object that stores the computed eigenvalues, the
proportion explained by each of them (per unit),
transformed coordinates for features and samples, biplot
scores, sample constraints, etc.

See Also
--------
ca
cca
OrdinationResults

Notes
-----
The algorithm is based on [1]_, \S 11.1, and is expected to
give the same results as ``rda(y, x)`` in R's package vegan.
The eigenvalues reported in vegan are re-normalized to
:math:`\sqrt{\frac{s}{n-1}}`, where `n` is the number of samples
and `s` is the original eigenvalue. Here we only return the
original eigenvalues, as recommended in [1]_.

References
----------
.. [1] Legendre P. and Legendre L. 1998. Numerical
Ecology. Elsevier, Amsterdam.
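
Examples
--------
A minimal, illustrative call on small random data; the sample and
variable names below are arbitrary and only meant to show the
expected input format.

>>> import numpy as np
>>> import pandas as pd
>>> from skbio.stats.ordination import rda
>>> rng = np.random.RandomState(0)
>>> Y = pd.DataFrame(rng.rand(10, 6),
...                  index=['site%d' % i for i in range(10)],
...                  columns=['species%d' % i for i in range(6)])
>>> X = pd.DataFrame(rng.rand(10, 3),
...                  index=['site%d' % i for i in range(10)],
...                  columns=['env%d' % i for i in range(3)])
>>> ordination = rda(Y, X, scaling=1)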
"""
Y = y.values
X = x.values
n, p = y.shape
n_, m = x.shape
if n != n_:
raise ValueError(
"Both data matrices must have the same number of rows.")
if n < m:
# Mmm actually vegan is able to do this case, too
raise ValueError(
"Explanatory variables cannot have less rows than columns.")
sample_ids = y.index
feature_ids = y.columns
# Centre response variables (they must be dimensionally
# homogeneous)
Y = scale(Y, with_std=scale_Y)
# Centre explanatory variables
X = scale(X, with_std=False)
# Distribution of variables should be examined and transformed
# if necessary (see paragraph 4 in p. 580 L&L 1998)
# Compute Y_hat (fitted values by multivariate linear
# regression, that is, linear least squares). Formula 11.6 in
# L&L 1998 involves solving the normal equations, but that fails
# when cond(X) ~ eps**(-0.5). A more expensive but much more
# stable solution (fails when cond(X) ~ eps**-1) is computed
# using the QR decomposition of X = QR:
# (11.6) Y_hat = X [X' X]^{-1} X' Y
# = QR [R'Q' QR]^{-1} R'Q' Y
# = QR [R' R]^{-1} R'Q' Y
# = QR R^{-1} R'^{-1} R' Q' Y
# = Q Q' Y
# and B (matrix of regression coefficients)
# (11.4) B = [X' X]^{-1} X' Y
# = R^{-1} R'^{-1} R' Q' Y
# = R^{-1} Q' Y
# Q, R = np.linalg.qr(X)
# Y_hat = Q.dot(Q.T).dot(Y)
# B = scipy.linalg.solve_triangular(R, Q.T.dot(Y))
# This works provided X has full rank. When not, you can still
# fix it using R's pseudoinverse or partitioning R. To avoid any
# issues, like the numerical instability when trying to
# reproduce an example in L&L where X was rank-deficient, we'll
# just use `scipy.linalg.lstsq` (imported above), which uses the SVD
# under the hood and so it's also more expensive.
B, _, rank_X, _ = lstsq(X, Y)
Y_hat = X.dot(B)
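# B holds one column of regression coefficients per response variable
# (shape m x p), and Y_hat = X B is the orthogonal projection of Y
# onto the column space of X.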
# Now let's perform PCA on the fitted values from the multiple
# regression
u, s, vt = svd(Y_hat, full_matrices=False)
# The rows of vt are the right singular vectors of Y_hat, which is
# what we need to perform PCA. That is, we're changing points in
# Y_hat from the canonical basis to the orthonormal basis given by
# those vectors (or equivalently, the eigenvectors of the
# covariance matrix Y_hat.T.dot(Y_hat))
# See 3) in p. 583 in L&L 1998
rank = svd_rank(Y_hat.shape, s)
# Theoretically, there're at most min(p, m, n - 1) non-zero eigenvalues
U = vt[:rank].T # U as in Fig. 11.2
# Ordination in the space of response variables. Its columns are
# sample scores. (Eq. 11.12)
F = Y.dot(U)
# Ordination in the space of explanatory variables. Its columns
# are fitted sample scores. (Eq. 11.13)
Z = Y_hat.dot(U)
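# F holds the sample ("site") scores in the space of the response
# variables, while Z holds the fitted sample scores that end up in
# `sample_constraints` below.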
# Canonical coefficients (formula 11.14)
# C = B.dot(U) # Not used
Y_res = Y - Y_hat
# PCA on the residuals
u_res, s_res, vt_res = svd(Y_res, full_matrices=False)
# See 9) in p. 587 in L&L 1998
rank_res = svd_rank(Y_res.shape, s_res)
# Theoretically, there're at most min(p, n - 1) non-zero eigenvalues, as
# this is the space of the residuals
U_res = vt_res[:rank_res].T
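# U_res (shape p x rank_res) spans the residual (unconstrained) axes,
# i.e. the variation in Y not explained by X.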
F_res = Y_res.dot(U_res) # Ordination in the space of residuals
eigenvalues = np.r_[s[:rank], s_res[:rank_res]]
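# Note that s and s_res are the singular values of Y_hat and Y_res;
# canonical and residual axes alike are labelled 'RDA%d' below (see
# the Notes section for how vegan re-scales these values).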
# Compute scores
if scaling not in {1, 2}:
raise NotImplementedError("Only scalings 1, 2 available for RDA.")
# According to the vegan-FAQ.pdf, the scaling factor for scores
# is (notice that L&L 1998 says in p. 586 that such scaling
# doesn't affect the interpretation of a biplot):
eigvals = pd.Series(
eigenvalues, index=['RDA%d' % (i+1) for i in range(len(eigenvalues))])
const = np.sum(eigenvalues**2)**0.25
if scaling == 1:
scaling_factor = const
elif scaling == 2:
scaling_factor = eigenvalues / const
feature_scores = np.hstack((U, U_res)) * scaling_factor
sample_scores = np.hstack((F, F_res)) / scaling_factor
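# With scaling 1 `scaling_factor` is a scalar, so every axis is
# rescaled by the same constant; with scaling 2 it is a 1-D array
# with one entry per axis, so the multiplication/division above is
# applied column-wise (per RDA axis).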
feature_scores = pd.DataFrame(
feature_scores, index=feature_ids,
columns=['RDA%d' % (i+1) for i in range(feature_scores.shape[1])])
sample_scores = pd.DataFrame(
sample_scores, index=sample_ids,
columns=['RDA%d' % (i+1) for i in range(sample_scores.shape[1])])
# TODO not yet used/displayed
sample_constraints = np.hstack((Z, F_res)) / scaling_factor
sample_constraints = pd.DataFrame(
sample_constraints, index=sample_ids,
columns=['RDA%d' % (i+1) for i in range(sample_constraints.shape[1])])
# Vegan seems to compute them as corr(X[:, :rank_X],
# u) but I don't think that's a good idea. In fact, if
# you take the example shown in Figure 11.3 in L&L 1998 you
# can see that there's an arrow for each of the 4
# environmental variables (depth, coral, sand, other) even if
# other = not(coral or sand)
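# Here each column of X is correlated with the sample scores `u`
# obtained from the SVD of the fitted values, giving the arrow
# coordinates for the explanatory variables in a biplot.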
biplot_scores = corr(X, u)
biplot_scores = pd.DataFrame(
biplot_scores, index=x.columns,
columns=['RDA%d' % (i+1) for i in range(biplot_scores.shape[1])])
# The "Correlations of environmental variables with sample
# scores" from table 11.4 are quite similar to vegan's biplot
# scores, but they're computed like this:
# corr(X, F))
p_explained = pd.Series(
eigenvalues / eigenvalues.sum(),
index=['RDA%d' % (i+1) for i in range(len(eigenvalues))])
return OrdinationResults('RDA', 'Redundancy Analysis',
eigvals=eigvals,
proportion_explained=p_explained,
features=feature_scores,
samples=sample_scores,
biplot_scores=biplot_scores,
sample_constraints=sample_constraints)