1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
"""
To run this, you'll need to have installed.
* scikit-learn
Does two benchmarks
First, we fix a training set, increase the number of
samples to classify and plot number of classified samples as a
function of time.
In the second benchmark, we increase the number of dimensions of the
training set, classify a sample and plot the time taken as a function
of the number of dimensions.
"""
import numpy as np
import pylab as pl
import gc
from datetime import datetime
# to store the results
scikit_classifier_results = []
scikit_regressor_results = []
mu_second = 0.0 + 10 ** 6 # number of microseconds in a second
def bench_scikit_tree_classifier(X, Y):
"""Bench with scikit-learn decision tree classifier"""
from sklearn.tree import DecisionTreeClassifier
gc.collect()
# start time
tstart = datetime.now()
clf = DecisionTreeClassifier()
clf.fit(X, Y).predict(X)
delta = (datetime.now() - tstart)
# stop time
scikit_classifier_results.append(
delta.seconds + delta.microseconds / mu_second)
def bench_scikit_tree_regressor(X, Y):
"""Bench with scikit-learn decision tree regressor"""
from sklearn.tree import DecisionTreeRegressor
gc.collect()
# start time
tstart = datetime.now()
clf = DecisionTreeRegressor()
clf.fit(X, Y).predict(X)
delta = (datetime.now() - tstart)
# stop time
scikit_regressor_results.append(
delta.seconds + delta.microseconds / mu_second)
if __name__ == '__main__':
print '============================================'
print 'Warning: this is going to take a looong time'
print '============================================'
n = 10
step = 10000
n_samples = 10000
dim = 10
n_classes = 10
for i in range(n):
print '============================================'
print 'Entering iteration %s of %s' % (i, n)
print '============================================'
n_samples += step
X = np.random.randn(n_samples, dim)
Y = np.random.randint(0, n_classes, (n_samples,))
bench_scikit_tree_classifier(X, Y)
Y = np.random.randn(n_samples)
bench_scikit_tree_regressor(X, Y)
xx = range(0, n * step, step)
pl.figure(1)
pl.subplot(211)
pl.title('Learning with varying number of samples')
pl.plot(xx, scikit_classifier_results, 'g-', label='classification')
pl.plot(xx, scikit_regressor_results, 'r-', label='regression')
pl.legend()
pl.xlabel('number of samples')
pl.ylabel('time (in seconds)')
scikit_classifier_results = []
scikit_regressor_results = []
n = 10
step = 500
start_dim = 500
n_classes = 10
dim = start_dim
for i in range(0, n):
print '============================================'
print 'Entering iteration %s of %s' % (i, n)
print '============================================'
dim += step
X = np.random.randn(100, dim)
Y = np.random.randint(0, n_classes, (100,))
bench_scikit_tree_classifier(X, Y)
Y = np.random.randn(100)
bench_scikit_tree_regressor(X, Y)
xx = np.arange(start_dim, start_dim + n * step, step)
pl.subplot(212)
pl.title('Learning in high dimensional spaces')
pl.plot(xx, scikit_classifier_results, 'g-', label='classification')
pl.plot(xx, scikit_regressor_results, 'r-', label='regression')
pl.legend()
pl.xlabel('number of dimensions')
pl.ylabel('time (in seconds)')
pl.axis('tight')
pl.show()
|