File: test_mldata.py

package info (click to toggle)
scikit-learn 0.11.0-2%2Bdeb7u1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 13,900 kB
  • sloc: python: 34,740; ansic: 8,860; cpp: 8,849; pascal: 230; makefile: 211; sh: 14
file content (158 lines) | stat: -rw-r--r-- 5,446 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Test functionality of mldata fetching utilities."""

from sklearn import datasets
from sklearn.datasets import mldata_filename, fetch_mldata
from sklearn.utils.testing import assert_in, assert_not_in, mock_urllib2
from nose.tools import assert_equal, assert_raises
from nose import with_setup
from numpy.testing import assert_array_equal
import os
import shutil
import tempfile
import scipy as sp

# Path of the temporary data home shared by the tests in this module.
# It is None until setup_tmpdata() creates the directory; teardown_tmpdata()
# removes the directory again.
tmpdir = None


def setup_tmpdata():
    """Create a fresh temporary data home holding an ``mldata`` folder.

    The path of the new directory is published through the module-level
    ``tmpdir`` variable so the tests can pass it as ``data_home``.
    """
    global tmpdir
    scratch_root = tempfile.mkdtemp()
    tmpdir = scratch_root
    mldata_dir = os.path.join(scratch_root, 'mldata')
    os.makedirs(mldata_dir)


def teardown_tmpdata():
    """Remove the temporary data home registered in ``tmpdir``.

    Does nothing when ``tmpdir`` is ``None``.  Once removal has been
    attempted, ``tmpdir`` is reset to ``None`` so that calling the teardown
    again does not try to delete the same, already removed, path.
    """
    global tmpdir
    if tmpdir is None:
        return
    try:
        shutil.rmtree(tmpdir)
    finally:
        # never keep a reference to a directory that is (being) removed
        tmpdir = None


def test_mldata_filename():
    """Check mldata_filename on known (data set name, file name) pairs."""
    name_to_filename = (
        ('datasets-UCI iris', 'datasets-uci-iris'),
        ('news20.binary', 'news20binary'),
        ('book-crossing-ratings-1.0', 'book-crossing-ratings-10'),
        ('Nile Water Level', 'nile-water-level'),
        ('MNIST (original)', 'mnist-original'),
    )
    for raw_name, expected in name_to_filename:
        assert_equal(mldata_filename(raw_name), expected)


@with_setup(setup_tmpdata, teardown_tmpdata)
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    # Replace the urllib2 used by datasets.mldata with a mock that only
    # knows a data set called 'mock'; the real module is put back in the
    # ``finally`` clause whatever the outcome of the assertions.
    _urllib2_ref = datasets.mldata.urllib2
    datasets.mldata.urllib2 = mock_urllib2({'mock':
                                            {'label': sp.ones((150,)),
                                             'data': sp.ones((150, 4))}})
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        # A name unknown to the mock is expected to raise HTTPError.  Pass
        # the temporary data home here as well, so this call does not fall
        # back to fetch_mldata's default data_home location.
        assert_raises(datasets.mldata.urllib2.HTTPError,
                      fetch_mldata, 'not_existing_name', data_home=tmpdir)
    finally:
        datasets.mldata.urllib2 = _urllib2_ref


@with_setup(setup_tmpdata, teardown_tmpdata)
def test_fetch_one_column():
    """Check fetch_mldata on a data set made of a single array named 'x'."""
    original_urllib2 = datasets.mldata.urllib2
    try:
        name = 'onecol'
        column = sp.arange(6).reshape(2, 3)
        # serve the fake one-column data set through the mocked urllib2
        datasets.mldata.urllib2 = mock_urllib2({name: {'x': column}})

        fetched = fetch_mldata(name, data_home=tmpdir)
        for key in ("COL_NAMES", "DESCR", "data"):
            assert_in(key, fetched)
        assert_not_in("target", fetched)

        assert_equal(fetched.data.shape, (2, 3))
        assert_array_equal(fetched.data, column)

        # fetch again with transpose_data=False: the shape comes back swapped
        untransposed = fetch_mldata(name, transpose_data=False,
                                    data_home=tmpdir)
        assert_equal(untransposed.data.shape, (3, 2))
    finally:
        # always restore the real urllib2 module
        datasets.mldata.urllib2 = original_urllib2


@with_setup(setup_tmpdata, teardown_tmpdata)
def test_fetch_multiple_column():
    """Check how fetch_mldata picks ``data`` and ``target`` among 3 columns.

    Four selections are exercised: the default column names, the column
    order, explicit column indices and explicit column names.
    """
    _urllib2_ref = datasets.mldata.urllib2
    try:
        # arrays served by the mocked urllib2 in the fake data sets below
        x = sp.arange(6).reshape(2, 3)
        y = sp.array([1, -1])
        z = sp.arange(12).reshape(4, 3)

        # by default: columns named 'label' and 'data' are picked up
        dataname = 'threecol-default'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'label': y,
                                                  'data': x,
                                                  'z': z},
                                                 ['z', 'data', 'label'])})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by order: no default names, first column is target, second is data
        dataname = 'threecol-order'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'y': y,
                                                  'x': x,
                                                  'z': z},
                                                 ['y', 'x', 'z'])})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by number: indices refer to the ordering ['z', 'x', 'y']
        dataname = 'threecol-number'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'y': y,
                                                  'x': x,
                                                  'z': z},
                                                 ['z', 'x', 'y'])})

        dset = fetch_mldata(dataname, target_name=2, data_name=0,
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)

        # by name: same data set, columns selected by their names
        dset = fetch_mldata(dataname, target_name='y', data_name='z',
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

        # 'z' and 'y' are the columns selected by index above, so the
        # returned arrays must match as well
        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)

    finally:
        datasets.mldata.urllib2 = _urllib2_ref