File: dataset.py

import numpy as N

import libsvm

__all__ = [
    'LibSvmRegressionDataSet',
    'LibSvmClassificationDataSet',
    'LibSvmOneClassDataSet',
    'LibSvmTestDataSet'
    ]

class LibSvmDataSet:
    def __init__(self, data):
        self.data = data
        # Map the address of each vector's data buffer back to the
        # vector itself, so that support vectors reported by libsvm can
        # be matched up with the original arrays.
        self.iddatamap = {}
        for y, x in data:
            key = x.__array_interface__['data'][0]
            self.iddatamap[key] = x
        # Delegate len() and iteration to the underlying list; binding
        # these on the instance works because this is an old-style class.
        self.__len__ = self.data.__len__
        self.__iter__ = self.data.__iter__

    def getgamma(self):
        # Default RBF gamma: the reciprocal of the largest feature
        # index, which is 1/num_features for dense data.
        maxlen = 0
        for y, x in self.data:
            maxlen = N.maximum(maxlen, x['index'].max())
        return 1.0 / maxlen
    gamma = property(getgamma, doc='Gamma parameter for RBF kernel')
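
    # Doctest-style sketch (illustrative, not part of the original API
    # surface): a dense 4-feature vector gets svm_node indices 1..4, so
    # gamma evaluates to 1/4, mirroring libsvm's 1/num_features default
    # for the RBF kernel.
    #
    # >>> ds = LibSvmRegressionDataSet([1.0], [N.arange(1.0, 5.0)])
    # >>> ds.gamma
    # 0.25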

    def precompute(self, kernel):
        return LibSvmPrecomputedDataSet(kernel, self.data)

    def _create_svm_problem(self):
        return libsvm.create_svm_problem(self.data)

    def _update_svm_parameter(self, param):
        # XXX we can handle gamma=None here
        pass

class LibSvmPrecomputedDataSet:
    def __init__(self, kernel, origdata=None):
        self.kernel = kernel
        self.origdata = origdata
        if origdata is None: return

        self.iddatamap = {}

        # Create the Gram matrix as a list of vectors, each with an
        # extra entry for the id field.
        n = len(origdata)
        grammat = [N.empty((n + 1,), dtype=libsvm.svm_node_dtype)
                   for i in range(n)]
        self.grammat = grammat

        # Calculate Gram matrix. Refer to Kernel::kernel_precomputed
        # in svm.cpp to see how this precomputed setup works.
        for i, (yi, xi) in enumerate(origdata):
            id = i + 1
            grammat[i][0] = 0, id
            # Map id to original vector so that we can find it again
            # after the model has been trained. libsvm essentially
            # provides the ids of the support vectors.
            self.iddatamap[id] = xi
            for j, (yj, xj) in enumerate(origdata[i:]):
                # Gram matrix is symmetric, so calculate dot product
                # once and store it in both required locations
                z = svm_node_dot(xi, xj, self.kernel)
                # fix index so we assign to the right place
                j += i
                grammat[i][j + 1] = 0, z
                grammat[j][i + 1] = 0, z
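
        # Resulting row layout for vector i (0-based):
        #   grammat[i][0]     == (0, i + 1)         -- libsvm id slot
        #   grammat[i][j + 1] == (0, K(x_i, x_j))   -- kernel values
        # This is the layout Kernel::kernel_precomputed in svm.cpp reads.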

    def __len__(self):
        return len(self.origdata)

    def __getitem__(self, key):
        return self.iddatamap[key]

    def getdata(self):
        return zip(map(lambda x: x[0], self.origdata), self.grammat)
    data = property(getdata)

    def combine_inplace(self, dataset):
        """
        Combine this dataset with another dataset by calculating the
        new part of the Gram matrix in place.
        """
        # XXX N.resize is our friend here
        raise NotImplementedError

    def combine(self, dataset):
        """
        Combine this dataset with another dataset by extending the
        Gram matrix with the new inner products into a new matrix.
        """
        n = len(self.origdata) + len(dataset.data) + 1
        newgrammat = []

        # copy original Gram matrix
        for i in range(len(self.origdata)):
            newrow = N.zeros((n,), dtype=libsvm.svm_node_dtype)
            oldrow = self.grammat[i]
            newrow[:len(oldrow)] = oldrow
            newgrammat.append(newrow)

        # prepare Gram matrix for new data
        for i in range(len(dataset.data)):
            row = N.zeros((n,), dtype=libsvm.svm_node_dtype)
            newgrammat.append(row)

        newiddatamap = dict(self.iddatamap.items())
        m = len(self.origdata)
        for i, (yi, xi) in enumerate(dataset.data):
            i += m
            for j, (yj, xj) in enumerate(self.origdata):
                z = svm_node_dot(xi, xj, self.kernel)
                newgrammat[i][j + 1] = 0, z
                newgrammat[j][i + 1] = 0, z
        for i, (yi, xi) in enumerate(dataset.data):
            k = m + i
            id = k + 1
            newgrammat[k][0] = 0, id
            newiddatamap[id] = xi
            for j, (yj, xj) in enumerate(dataset.data[i:]):
                z = svm_node_dot(xi, xj, self.kernel)
                j += k
                newgrammat[k][j + 1] = 0, z
                newgrammat[j][k + 1] = 0, z

        newdataset = self.__class__(self.kernel)
        newdataset.origdata = self.origdata + dataset.data
        newdataset.iddatamap = newiddatamap
        newdataset.grammat = newgrammat
        return newdataset
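
    # Usage sketch with hypothetical data, assuming a plain callable
    # kernel such as ``lambda a, b: N.dot(a, b)``:
    #
    # >>> d1 = LibSvmRegressionDataSet([1.0], [N.array([1.0, 2.0])])
    # >>> d2 = LibSvmRegressionDataSet([2.0], [N.array([3.0, 4.0])])
    # >>> pre = d1.precompute(lambda a, b: N.dot(a, b))
    # >>> both = pre.combine(d2)
    # >>> len(both.grammat)
    # 2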

    def _create_svm_problem(self):
        return libsvm.create_svm_problem(self.data)

    def _update_svm_parameter(self, param):
        param.kernel_type = libsvm.PRECOMPUTED

class LibSvmRegressionDataSet(LibSvmDataSet):
    def __init__(self, y, x):
        origdata = zip(y, x)
        data = [(x[0], convert_to_svm_node(x[1])) for x in origdata]
        LibSvmDataSet.__init__(self, data)

class LibSvmClassificationDataSet(LibSvmDataSet):
    def __init__(self, labels, x):
        origdata = zip(labels, x)
        data = [(x[0], convert_to_svm_node(x[1])) for x in origdata]
        LibSvmDataSet.__init__(self, data)

class LibSvmOneClassDataSet(LibSvmDataSet):
    def __init__(self, x):
        data = [(0, convert_to_svm_node(y)) for y in x]
        LibSvmDataSet.__init__(self, data)
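
# Construction sketch (hypothetical labels and vectors): dense rows are
# converted to sparse svm_node arrays by convert_to_svm_node below.
#
# >>> labels = [0, 1]
# >>> vectors = [N.array([1.0, 2.0]), N.array([3.0, 4.0])]
# >>> cds = LibSvmClassificationDataSet(labels, vectors)
# >>> len(cds)
# 2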

class LibSvmTestDataSet:
    def __init__(self, data):
        self.data = data
        self.__len__ = self.data.__len__

    def __iter__(self):
        for x in self.data:
            yield convert_to_svm_node(x)

    def is_array_data(self):
        return isinstance(self.data, N.ndarray) and \
            self.data.dtype in N.sctypes['float']

def convert_to_svm_node(x):
    # Build a sparse svm_node array terminated by a (-1, 0.) sentinel,
    # as libsvm expects.
    y = N.empty(len(x) + 1, dtype=libsvm.svm_node_dtype)
    y[-1] = -1, 0.
    if isinstance(x, dict):
        x = x.items()
    if isinstance(x, list):
        # (index, value) pairs must be sorted by index.
        x.sort(cmp=lambda x, y: cmp(x[0], y[0]))
        y[:-1] = x
    else:
        # Dense sequence: assign consecutive 1-based indices.
        y['index'][:-1] = N.arange(1, len(x) + 1)
        y['value'][:-1] = x
    assert N.alltrue(y[:-1]['index'] >= 1, axis=0), \
        'indices must be positive'
    assert len(x) == len(N.unique(y[:-1]['index'])), \
        'indices must be unique'
    return y
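
# convert_to_svm_node accepts a dense sequence, a list of (index, value)
# pairs, or a dict mapping index to value. Illustrative sketch:
#
# >>> y = convert_to_svm_node({3: 0.5, 1: 1.0})
# >>> list(y['index'])    # sorted indices plus the -1 sentinel
# [1, 3, -1]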

def svm_node_dot(x, y, kernel):
    # Scatter both sparse svm_node vectors into dense arrays indexed by
    # the 1-based feature index (position 0 stays unused), then apply
    # the kernel. The [:-1] slices drop the trailing -1 sentinel entry.
    maxlen = N.maximum(x['index'].max(), y['index'].max()) + 1
    tmpx = N.zeros((maxlen,), N.float64)
    tmpy = N.zeros((maxlen,), N.float64)
    tmpx[x['index'][:-1]] = x['value'][:-1]
    tmpy[y['index'][:-1]] = y['value'][:-1]
    return kernel(tmpx, tmpy)
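
if __name__ == '__main__':
    # Minimal smoke test (a sketch, not part of the original module).
    # It only assumes that the local ``libsvm`` wrapper module exposes
    # ``svm_node_dtype``, which the code above already relies on.
    x1 = convert_to_svm_node(N.array([1.0, 2.0, 3.0]))
    x2 = convert_to_svm_node({1: 1.0, 3: 0.5})
    linear = lambda a, b: N.dot(a, b)
    # Expected: 1.0*1.0 + 3.0*0.5 = 2.5
    print svm_node_dot(x1, x2, linear)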