File: test_incremental_pca.py

package info (click to toggle)
scikit-learn 0.20.2%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 51,036 kB
  • sloc: python: 108,171; ansic: 8,722; cpp: 5,651; makefile: 192; sh: 40
file content (349 lines) | stat: -rw-r--r-- 13,214 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
"""Tests for Incremental PCA."""
import numpy as np

from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_allclose_dense_sparse

from sklearn import datasets
from sklearn.decomposition import PCA, IncrementalPCA

iris = datasets.load_iris()


def test_incremental_pca():
    # Incremental PCA on dense arrays.
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
                        pca.explained_variance_ratio_.sum(), 1)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]))


def test_incremental_pca_check_projection():
    # Test that the projection of data is correct.
    rng = np.random.RandomState(1999)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    # Get the reconstruction of the generated data X
    # Note that Xt has the same "components" as X, just separated
    # This is what we want to ensure is recreated correctly
    Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt)

    # Normalize
    Yt /= np.sqrt((Yt ** 2).sum())

    # Make sure that the first element of Yt is ~1, this means
    # the reconstruction worked as expected
    assert_almost_equal(np.abs(Yt[0][0]), 1., 1)


def test_incremental_pca_inverse():
    # Test that the projection of data can be inverted.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    Y_inverse = ipca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)


def test_incremental_pca_validation():
    # Test that n_components is >=1 and <= n_features.
    X = np.array([[0, 1, 0], [1, 0, 0]])
    n_samples, n_features = X.shape
    for n_components in [-1, 0, .99, 4]:
        assert_raises_regex(ValueError,
                            "n_components={} invalid for n_features={}, need"
                            " more rows than columns for IncrementalPCA "
                            "processing".format(n_components, n_features),
                            IncrementalPCA(n_components, batch_size=10).fit, X)

    # Tests that n_components is also <= n_samples.
    n_components = 3
    assert_raises_regex(ValueError,
                        "n_components={} must be less or equal to "
                        "the batch number of samples {}".format(
                            n_components, n_samples),
                        IncrementalPCA(
                            n_components=n_components).partial_fit, X)


def test_n_components_none():
    # Ensures that n_components == None is handled correctly
    rng = np.random.RandomState(1999)
    for n_samples, n_features in [(50, 10), (10, 50)]:
        X = rng.rand(n_samples, n_features)
        ipca = IncrementalPCA(n_components=None)

        # First partial_fit call, ipca.n_components_ is inferred from
        # min(X.shape)
        ipca.partial_fit(X)
        assert ipca.n_components_ == min(X.shape)

        # Second partial_fit call, ipca.n_components_ is inferred from
        # ipca.components_ computed from the first partial_fit call
        ipca.partial_fit(X)
        assert ipca.n_components_ == ipca.components_.shape[0]


def test_incremental_pca_set_params():
    # Test that components_ sign is stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    X2 = rng.randn(n_samples, n_features)
    X3 = rng.randn(n_samples, n_features)
    ipca = IncrementalPCA(n_components=20)
    ipca.fit(X)
    # Decreasing number of components
    ipca.set_params(n_components=10)
    assert_raises(ValueError, ipca.partial_fit, X2)
    # Increasing number of components
    ipca.set_params(n_components=15)
    assert_raises(ValueError, ipca.partial_fit, X3)
    # Returning to original setting
    ipca.set_params(n_components=20)
    ipca.partial_fit(X)


def test_incremental_pca_num_features_change():
    # Test that changing n_components will raise an error.
    rng = np.random.RandomState(1999)
    n_samples = 100
    X = rng.randn(n_samples, 20)
    X2 = rng.randn(n_samples, 50)
    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)
    assert_raises(ValueError, ipca.partial_fit, X2)


def test_incremental_pca_batch_signs():
    # Test that components_ sign is stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(10, 20)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(np.sign(i), np.sign(j), decimal=6)


def test_incremental_pca_batch_values():
    # Test that components_ values are stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 40, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(i, j, decimal=1)


def test_incremental_pca_batch_rank():
    # Test sample size in each batch is always larger or equal to n_components
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 90, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for components_i, components_j in zip(all_components[:-1],
                                          all_components[1:]):
        assert_allclose_dense_sparse(components_i, components_j)


def test_incremental_pca_partial_fit():
    # Test that fit and partial_fit get equivalent results.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    batch_size = 10
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
    pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    # Add one to make sure endpoint is included
    batch_itr = np.arange(0, n + 1, batch_size)
    for i, j in zip(batch_itr[:-1], batch_itr[1:]):
        pipca.partial_fit(X[i:j, :])
    assert_almost_equal(ipca.components_, pipca.components_, decimal=3)


def test_incremental_pca_against_pca_iris():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    X = iris.data

    Y_pca = PCA(n_components=2).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)


def test_incremental_pca_against_pca_random_data():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)

    Y_pca = PCA(n_components=3).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)


def test_explained_variances():
    # Test that PCA and IncrementalPCA calculations match
    X = datasets.make_low_rank_matrix(1000, 100, tail_strength=0.,
                                      effective_rank=10, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 99]:
        pca = PCA(n_components=nc).fit(X)
        ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
        assert_almost_equal(pca.explained_variance_, ipca.explained_variance_,
                            decimal=prec)
        assert_almost_equal(pca.explained_variance_ratio_,
                            ipca.explained_variance_ratio_, decimal=prec)
        assert_almost_equal(pca.noise_variance_, ipca.noise_variance_,
                            decimal=prec)


def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=10, random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=3, random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)


def test_whitening():
    # Test that PCA and IncrementalPCA transforms match to sign flip.
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)


def test_incremental_pca_partial_fit_float_division():
    # Test to ensure float division is used in all versions of Python
    # (non-regression test for issue #9489)

    rng = np.random.RandomState(0)
    A = rng.randn(5, 3) + 2
    B = rng.randn(7, 3) + 5

    pca = IncrementalPCA(n_components=2)
    pca.partial_fit(A)
    # Set n_samples_seen_ to be a floating point number instead of an int
    pca.n_samples_seen_ = float(pca.n_samples_seen_)
    pca.partial_fit(B)
    singular_vals_float_samples_seen = pca.singular_values_

    pca2 = IncrementalPCA(n_components=2)
    pca2.partial_fit(A)
    pca2.partial_fit(B)
    singular_vals_int_samples_seen = pca2.singular_values_

    np.testing.assert_allclose(singular_vals_float_samples_seen,
                               singular_vals_int_samples_seen)