File: bench_plot_nmf.py

package info (click to toggle)
scikit-learn 0.11.0-2%2Bdeb7u1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 13,900 kB
  • sloc: python: 34,740; ansic: 8,860; cpp: 8,849; pascal: 230; makefile: 211; sh: 14
file content (162 lines) | stat: -rw-r--r-- 5,619 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Benchmarks of Non-Negative Matrix Factorization
"""

import gc
from time import time
import numpy as np
from collections import defaultdict

from sklearn.decomposition.nmf import NMF, _initialize_nmf
from sklearn.datasets.samples_generator import make_low_rank_matrix


def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
    '''
    W, H = alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None)

    Implement Lee & Seung's multiplicative update algorithm

    Parameters
    ----------
    V : 2-ndarray, [n_samples, n_features]
        input matrix
    r : integer
        number of latent features
    max_iter : integer, optional
        maximum number of iterations (default: 1000)
    tol : double
        tolerance threshold for early exit (when the largest update factor
        is within tol of 1., the function exits)
    R : None, "svd", integer or numpy.random.RandomState, optional
        how the factors are initialized:

        - None: absolute values of standard normal draws from numpy's
          global random state (default)
        - "svd": factors returned by ``_initialize_nmf(V, r)``
        - integer: seed of a fresh ``numpy.random.RandomState``
        - RandomState instance: used directly to draw the factors

    Returns
    -------
    W : 2-ndarray, [n_samples, r]
        Component part of the factorization

    H : 2-ndarray, [r, n_features]
        Data part of the factorization

    Raises
    ------
    ValueError
        if R is none of the supported initialization values

    Reference
    ---------
    "Algorithms for Non-negative Matrix Factorization"
    by Daniel D Lee, Sebastian H Seung
    (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
    '''
    # Nomenclature in the function follows Lee & Seung
    eps = 1e-5  # keeps the denominators of the update factors away from zero
    n, m = V.shape

    rng = None
    if R is None:
        rng = np.random.mtrand._rand
    elif isinstance(R, np.random.RandomState):
        rng = R
    elif isinstance(R, (int, np.integer)):
        rng = np.random.RandomState(R)
    elif R != "svd":
        raise ValueError("R must be None, 'svd', an integer seed or a "
                         "RandomState instance; got %r" % (R,))

    if rng is None:
        W, H = _initialize_nmf(V, r)
    else:
        W = np.abs(rng.standard_normal((n, r)))
        H = np.abs(rng.standard_normal((r, m)))

    for _ in range(max_iter):
        # H is updated first so that the W update already sees the new H
        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
        H *= updateH
        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
        W *= updateW
        # convergence is tested on every iteration, using only the largest
        # multiplicative factor applied to either matrix
        max_update = max(updateW.max(), updateH.max())
        if abs(1. - max_update) < tol:
            break
    return W, H


def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7,
                  n_components=30):
    """Time several NMF variants over a grid of problem sizes.

    For every (n_samples, n_features) pair a non-negative matrix is built
    from ``make_low_rank_matrix`` and factorized by scikit-learn's ``NMF``
    with the 'nndsvd', 'nndsvda', 'nndsvdar' and default (``init=None``)
    initializations, then by ``alt_nnmf``. Progress and results are printed
    as the benchmark runs.

    Parameters
    ----------
    samples_range : sequence of int
        numbers of samples to benchmark (outer loop)
    features_range : sequence of int
        numbers of features to benchmark (inner loop)
    rank : integer, optional
        ``effective_rank`` passed to ``make_low_rank_matrix``
    tolerance : double, optional
        ``tol`` passed to every solver
    n_components : integer, optional
        number of components requested from every solver (default: 30)

    Returns
    -------
    timeset : defaultdict of list
        fit durations in seconds, keyed by method name; each list holds one
        entry per grid point, in iteration order
    err : defaultdict of list
        reconstruction errors, with the same keys and ordering as timeset
    """
    # (result key, progress banner, NMF keyword arguments) for each
    # scikit-learn variant, in the order they are benchmarked
    sklearn_variants = (
        ('nndsvd-nmf', "benching nndsvd-nmf: ", {'init': 'nndsvd'}),
        ('nndsvda-nmf', "benching nndsvda-nmf: ", {'init': 'nndsvda'}),
        ('nndsvdar-nmf', "benching nndsvdar-nmf: ", {'init': 'nndsvdar'}),
        ('random-nmf', "benching random-nmf",
         {'init': None, 'max_iter': 1000}),
    )

    timeset = defaultdict(list)
    err = defaultdict(list)

    it = 0
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            # absolute value makes the generated matrix non-negative
            X = np.abs(make_low_rank_matrix(n_samples, n_features,
                       effective_rank=rank, tail_strength=0.2))

            for key, banner, nmf_kwargs in sklearn_variants:
                # collect garbage before starting the clock
                gc.collect()
                print(banner)
                tstart = time()
                m = NMF(n_components=n_components, tol=tolerance,
                        **nmf_kwargs).fit(X)
                tend = time() - tstart
                timeset[key].append(tend)
                err[key].append(m.reconstruction_err_)
                print('%s %s' % (m.reconstruction_err_, tend))

            gc.collect()
            print("benching alt-random-nmf")
            tstart = time()
            W, H = alt_nnmf(X, r=n_components, R=None, tol=tolerance)
            tend = time() - tstart
            # the residual norm is computed once, outside the timed section
            residual = np.linalg.norm(X - np.dot(W, H))
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(residual)
            print('%s %s' % (residual, tend))

    return timeset, err


if __name__ == '__main__':
    # Run the benchmark on a 3x3 grid of problem sizes and show two 3D
    # figures: fit time, then reconstruction error, one surface per method.
    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
    import matplotlib.pyplot as plt

    # plain ``int`` replaces the ``np.int`` alias, which newer NumPy removed
    samples_range = np.linspace(50, 500, 3).astype(int)
    features_range = np.linspace(50, 500, 3).astype(int)
    timeset, err = compute_bench(samples_range, features_range)

    # The grids do not depend on the method, so build them once.
    # np.meshgrid returns arrays of shape (n_features, n_samples).
    X, Y = np.meshgrid(samples_range, features_range)
    grid_shape = (samples_range.shape[0], features_range.shape[0])

    for i, results in enumerate((timeset, err)):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for c, (label, timings) in zip('rbgcm', sorted(results.items())):
            # compute_bench appends results with n_samples as the outer loop,
            # so the reshape yields (n_samples, n_features); transpose to
            # line the values up with the meshgrid layout of X and Y
            Z = np.asarray(timings).reshape(grid_shape).T
            # plot the actual surface
            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
                            color=c)
            # dummy point plot to stick the legend to since surface plot do not
            # support legends (yet?)
            ax.plot([1], [1], [1], color=c, label=label)

        ax.set_xlabel('n_samples')
        ax.set_ylabel('n_features')
        zlabel = 'time (s)' if i == 0 else 'reconstruction error'
        ax.set_zlabel(zlabel)
        ax.legend()
        plt.show()