File: _canonical_correspondence_analysis.py

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import numpy as np
import pandas as pd
from scipy.linalg import svd, lstsq

from ._ordination_results import OrdinationResults
from ._utils import corr, svd_rank, scale
from skbio.util._decorator import experimental


@experimental(as_of="0.4.0")
def cca(y, x, scaling=1):
    r"""Compute canonical (also known as constrained) correspondence
    analysis.

    Canonical (or constrained) correspondence analysis is a
    multivariate ordination technique. It appeared in community
    ecology [1]_ and relates community composition to the variation in
    the environment (or in other factors). It works from data on
    abundances or counts of samples and constraint variables,
    and outputs ordination axes that maximize niche separation among
    species.

    It is better suited to extract the niches of taxa than linear
    multivariate methods because it assumes unimodal response curves
    (habitat preferences are often unimodal functions of habitat
    variables [2]_).

    As more environmental variables are added, the result gets more
    similar to unconstrained ordination, so only the variables that
    are deemed explanatory should be included in the analysis.

    Parameters
    ----------
    y : DataFrame
        Samples by features table (n, m)
    x : DataFrame
        Samples by constraints table (n, q)
    scaling : int, {1, 2}, optional
        Scaling type 1 maintains :math:`\chi^2` distances between rows.
        Scaling type 2 preserves :math:`\chi^2` distances between columns.
        For a more detailed explanation of the interpretation, see Legendre &
        Legendre 1998, section 9.4.3.

    Returns
    -------
    OrdinationResults
        Object that stores the cca results.

    Raises
    ------
    ValueError
        If `x` and `y` have a different number of rows,
        if `y` contains negative values,
        or if `y` contains a row of only 0's.
    NotImplementedError
        If scaling is not 1 or 2.

    See Also
    --------
    ca
    rda
    OrdinationResults

    Notes
    -----
    The algorithm is based on [3]_, \S 11.2, and is expected to give
    the same results as ``cca(y, x)`` in R's package vegan, except
    that this implementation won't drop constraining variables due to
    perfect collinearity: the user needs to choose which ones to
    input.

    Canonical *correspondence* analysis shouldn't be confused with
    canonical *correlation* analysis (CCorA, but sometimes called
    CCA), a different technique to search for multivariate
    relationships between two datasets. Canonical correlation analysis
    is a statistical tool that, given two vectors of random variables,
    finds linear combinations that have maximum correlation with each
    other. In some sense, it assumes linear responses of "species" to
    "environmental variables" and is not well suited to analyze
    ecological data.

    References
    ----------
    .. [1] Cajo J. F. Ter Braak, "Canonical Correspondence Analysis: A
        New Eigenvector Technique for Multivariate Direct Gradient
        Analysis", Ecology 67.5 (1986), pp. 1167-1179.

    .. [2] Cajo J. F. ter Braak and Piet F. M. Verdonschot, "Canonical
        correspondence analysis and related multivariate methods in
        aquatic ecology", Aquatic Sciences 57.3 (1995), pp. 255-289.

    .. [3] Legendre P. and Legendre L. 1998. Numerical
       Ecology. Elsevier, Amsterdam.

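    Examples
    --------
    A minimal, purely illustrative call. The sample, feature, and constraint
    labels and the values below are made up for demonstration only:

    >>> import pandas as pd
    >>> from skbio.stats.ordination import cca
    >>> Y = pd.DataFrame([[1, 0, 3],
    ...                   [2, 2, 0],
    ...                   [0, 4, 1]],
    ...                  index=['S1', 'S2', 'S3'],
    ...                  columns=['sp1', 'sp2', 'sp3'])
    >>> X = pd.DataFrame([[1.0, 0.5],
    ...                   [2.0, 1.5],
    ...                   [3.0, 0.0]],
    ...                  index=['S1', 'S2', 'S3'],
    ...                  columns=['pH', 'depth'])
    >>> ordination = cca(Y, X, scaling=1)

    ``ordination.samples`` and ``ordination.features`` hold the sample and
    feature scores (one column per ordination axis), and
    ``ordination.proportion_explained`` the fraction of inertia explained
    by each axis.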
    """
    Y = y.values
    X = x.values

    # Perform parameter sanity checks
    if X.shape[0] != Y.shape[0]:
        raise ValueError("The samples by features table 'y' and the samples by"
                         " constraints table 'x' must have the same number of"
                         " rows. 'y': {0} 'x': {1}".format(Y.shape[0],
                                                           X.shape[0]))
    if Y.min() < 0:
        raise ValueError(
            "The samples by features table 'y' must be nonnegative")
    row_max = Y.max(axis=1)
    if np.any(row_max <= 0):
        # Or else the lstsq call to compute Y_hat breaks
        raise ValueError("The samples by features table 'y' cannot contain a "
                         "row with only 0's")
    if scaling not in {1, 2}:
        raise NotImplementedError(
            "Scaling {0} not implemented.".format(scaling))

    # Step 1 (similar to Pearson chi-square statistic)
    grand_total = Y.sum()
    Q = Y / grand_total  # Relative frequencies of Y (contingency table)

    # Features and sample weights (marginal totals)
    column_marginals = Q.sum(axis=0)
    row_marginals = Q.sum(axis=1)

    # Formula 9.32 in Legendre & Legendre (1998). Notice that it's a
    # scaled version of the contribution of each cell towards the Pearson
    # chi-square statistic.
    expected = np.outer(row_marginals, column_marginals)
    Q_bar = (Q - expected) / np.sqrt(expected)
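    # Q_bar is the matrix whose SVD would give plain (unconstrained) CA;
    # CCA differs in that Q_bar is first regressed on the constraints below.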

    # Step 2. Standardize columns of X with respect to sample weights,
    # using the maximum likelihood variance estimator (Legendre &
    # Legendre 1998, p. 595)
    X = scale(X, weights=row_marginals, ddof=0)

    # Step 3. Weighted multiple regression.
    X_weighted = row_marginals[:, None]**0.5 * X
    B, _, rank_lstsq, _ = lstsq(X_weighted, Q_bar)
    Y_hat = X_weighted.dot(B)
    Y_res = Q_bar - Y_hat
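    # Pre-scaling the rows of X by the square roots of the sample weights
    # lets the ordinary least-squares fit act as the weighted regression
    # (Q_bar already carries the row weights from Step 1). Y_hat is the part
    # of Q_bar explained by the constraints and Y_res the remainder; the two
    # are ordinated separately below.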

    # Step 4. Eigenvalue decomposition
    u, s, vt = svd(Y_hat, full_matrices=False)
    rank = svd_rank(Y_hat.shape, s)
    s = s[:rank]
    u = u[:, :rank]
    vt = vt[:rank]
    U = vt.T
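    # u spans the constrained axes in (weighted) sample space and U = vt.T
    # spans them in feature space; only the first `rank` axes are kept.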

    # Step 5. Eq. 9.38
    U_hat = Q_bar.dot(U) * s**-1
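    # U_hat projects the full Q_bar (not just the fitted Y_hat) onto the
    # constrained feature axes; the sample scores constrained to be linear
    # combinations of the explanatory variables are computed from Y_hat
    # below as Z_scaling1 / Z_scaling2.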

    # Residuals analysis
    u_res, s_res, vt_res = svd(Y_res, full_matrices=False)
    rank = svd_rank(Y_res.shape, s_res)
    s_res = s_res[:rank]
    u_res = u_res[:, :rank]
    vt_res = vt_res[:rank]

    U_res = vt_res.T
    U_hat_res = Y_res.dot(U_res) * s_res**-1

    eigenvalues = np.r_[s, s_res]**2
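    # The squared singular values are the eigenvalues (inertia) of the
    # constrained axes, followed by those of the residual (unconstrained)
    # axes.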

    # Scalings (p. 596 L&L 1998):
    # feature scores, scaling 1
    V = (column_marginals**-0.5)[:, None] * U

    # sample scores, scaling 2
    V_hat = (row_marginals**-0.5)[:, None] * U_hat

    # sample scores, scaling 1
    F = V_hat * s

    # feature scores, scaling 2
    F_hat = V * s

    # Sample scores which are linear combinations of constraint
    # variables
    Z_scaling1 = ((row_marginals**-0.5)[:, None] *
                  Y_hat.dot(U))
    Z_scaling2 = Z_scaling1 * s**-1
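    # Z_scaling1 / Z_scaling2 are the sample scores restricted to linear
    # combinations of the (weighted) constraint variables, under scaling 1
    # and scaling 2 respectively.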

    # Feature residual scores, scaling 1
    V_res = (column_marginals**-0.5)[:, None] * U_res

    # Sample residual scores, scaling 2
    V_hat_res = (row_marginals**-0.5)[:, None] * U_hat_res

    # Sample residual scores, scaling 1
    F_res = V_hat_res * s_res

    # Feature residual scores, scaling 2
    F_hat_res = V_res * s_res

    if scaling == 1:
        features_scores = np.hstack((V, V_res))
        sample_scores = np.hstack((F, F_res))
        sample_constraints = np.hstack((Z_scaling1, F_res))
    elif scaling == 2:
        features_scores = np.hstack((F_hat, F_hat_res))
        sample_scores = np.hstack((V_hat, V_hat_res))
        sample_constraints = np.hstack((Z_scaling2, V_hat_res))

    biplot_scores = corr(X_weighted, u)
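    # biplot_scores holds the correlations between the weighted, standardized
    # constraints and the constrained sample axes (the columns of u); these
    # are the arrows drawn for the constraining variables in a biplot.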

    pc_ids = ['CCA%d' % (i+1) for i in range(len(eigenvalues))]
    sample_ids = y.index
    feature_ids = y.columns
    eigvals = pd.Series(eigenvalues, index=pc_ids)
    samples = pd.DataFrame(sample_scores,
                           columns=pc_ids, index=sample_ids)
    features = pd.DataFrame(features_scores,
                            columns=pc_ids, index=feature_ids)

    biplot_scores = pd.DataFrame(biplot_scores,
                                 index=x.columns,
                                 columns=pc_ids[:biplot_scores.shape[1]])
    sample_constraints = pd.DataFrame(sample_constraints,
                                      index=sample_ids, columns=pc_ids)

    return OrdinationResults(
        "CCA", "Canonical Correspondence Analysis", eigvals, samples,
        features=features, biplot_scores=biplot_scores,
        sample_constraints=sample_constraints,
        proportion_explained=eigvals / eigvals.sum())