File: classification.py

package info (click to toggle)
python-scipy 0.6.0-12
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 32,016 kB
  • ctags: 46,675
  • sloc: cpp: 124,854; ansic: 110,614; python: 108,664; fortran: 76,260; objc: 424; makefile: 384; sh: 10
file content (193 lines) | stat: -rw-r--r-- 6,933 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from ctypes import POINTER, c_int, c_double
from itertools import izip, repeat, chain
import numpy as N

from model import LibSvmModel
import libsvm

# Names exported by a star-import of this module.
__all__ = [
    'LibSvmCClassificationModel',
    'LibSvmNuClassificationModel',
    'LibSvmClassificationResults',
]

class LibSvmClassificationResults:
    """
    Results of training a classification model.

    On construction, the class count, class labels, rho values,
    per-class support vector counts and support vector coefficients
    are copied out of the trained model. A predictor object is created
    and used by the predict* methods.
    """

    def __init__(self, model, traindataset, kernel, PredictorType):
        """
        Parameters:

        - `model`: object whose `contents` attribute is the trained
          libsvm model (presumably a ctypes pointer -- confirm against
          the caller).
        - `traindataset`: passed through to `PredictorType`.
        - `kernel`: passed through to `PredictorType`.
        - `PredictorType`: called as
          `PredictorType(model, traindataset, kernel)` to build the
          predictor used by this object.

        Raises TypeError if the model's svm_type is neither C_SVC nor
        NU_SVC.
        """
        modelc = model.contents
        if modelc.param.svm_type not in [libsvm.C_SVC, libsvm.NU_SVC]:
            raise TypeError('%s is for classification problems' %
                            str(self.__class__))
        self.nr_class = modelc.nr_class
        self.labels = modelc.labels[:self.nr_class]
        # One rho per unordered pair of classes. Floor division keeps
        # the count an integer (it is used as a slice bound) even when
        # true division is in effect.
        nrho = self.nr_class * (self.nr_class - 1) // 2
        self.rho = modelc.rho[:nrho]
        self.nSV = modelc.nSV[:self.nr_class]
        # Copy the (nr_class - 1) coefficient rows, each of length
        # modelc.l, into an array owned by this object.
        sv_coef = N.empty((self.nr_class - 1, modelc.l), dtype=N.float64)
        for i, c in enumerate(modelc.sv_coef[:self.nr_class - 1]):
            sv_coef[i,:] = c[:modelc.l]
        self.sv_coef = sv_coef
        self.predictor = PredictorType(model, traindataset, kernel)

    def predict(self, dataset):
        """
        This function does classification on a test dataset and
        returns a list with the predicted class label (as an int) for
        each item in the dataset.
        """
        if self.predictor.is_compact and dataset.is_array_data():
            # A compact predictor is handed the whole data array in a
            # single call.
            return [int(x) for x in
                    self.predictor.predict(dataset.data)]
        else:
            return [int(self.predictor.predict(x)) for x in dataset]

    def predict_values(self, dataset):
        """
        This function does classification on a test dataset and
        returns decision values.

        For training data with nr_class classes, the predictor is
        asked for nr_class*(nr_class-1)/2 decision values for each
        item in the test dataset. This function returns a list with
        one dictionary per item. The keys of the dictionary are
        2-tuples of class labels: each decision value v is stored
        under (li, lj) and -v is stored under (lj, li).
        """
        # Floor division: the predictor needs an integer count.
        n = self.nr_class * (self.nr_class - 1) // 2
        labels = self.labels
        # Label pairs (li, lj), with li preceding lj in self.labels, in
        # the order in which the decision values are matched to them.
        pairs = [(li, lj)
                 for i, li in enumerate(labels[:-1])
                 for lj in labels[i + 1:]]

        def p(vv):
            d = {}
            # atleast_1d lets a scalar (the two-class case) be iterated.
            for v, (li, lj) in zip(N.atleast_1d(vv), pairs):
                d[li, lj] = v
                d[lj, li] = -v
            return d

        if self.predictor.is_compact and dataset.is_array_data():
            vs = self.predictor.predict_values(dataset.data, n)
        else:
            vs = [self.predictor.predict_values(x, n) for x in dataset]
        return [p(v) for v in vs]

    def predict_probability(self, dataset):
        """
        This function does classification on a test dataset for a
        model with probability information.

        This function returns a list of 2-tuples, one per item in the
        dataset. The first item in each tuple is the label (as an int)
        returned by the predictor. The second item is the probability
        estimates exactly as returned by the predictor.
        """
        def p(x):
            label, prob_estimates = \
                self.predictor.predict_probability(x, self.nr_class)
            return int(label), prob_estimates
        return [p(x) for x in dataset]

    def compact(self):
        """
        Ask the underlying predictor to compact itself.
        """
        self.predictor.compact()

class LibSvmClassificationModel(LibSvmModel):
    """
    A model for support vector classification.

    Classification models can predict a class label, decision values
    over all classes or a posterior class probability.

    See also:

    - Platt. Probabilistic Outputs for Support Vector Machines and
      Comparisons to Regularized Likelihood Methods.
    - Lin. A Note on Platt's Probabilistic Outputs for Support Vector
      Machines.
    """

    ResultsType = LibSvmClassificationResults

    def __init__(self, kernel, weights, **kwargs):
        """
        Parameters:

        - `kernel`: forwarded to the LibSvmModel constructor.
        - `weights`: None, or an iterable of (label, weight) pairs.
          The pairs are sorted and stored in the libsvm parameters as
          parallel label and weight arrays.
        - `kwargs`: forwarded to the LibSvmModel constructor.
        """
        LibSvmModel.__init__(self, kernel, **kwargs)
        if weights is not None:
            # Sort a copy so the caller's sequence is left untouched.
            weights = sorted(weights)
            # The label buffer is handed to libsvm as a C `int *`, so
            # its element type must be the C int (intc). With intp the
            # elements are 8 bytes wide on 64-bit platforms and libsvm
            # would read the wrong values.
            self.weight_labels = N.empty((len(weights),), dtype=N.intc)
            self.weights = N.empty((len(weights),), dtype=N.float64)
            for i, (label, weight) in enumerate(weights):
                self.weight_labels[i] = label
                self.weights[i] = weight
            # The arrays stay on the instance; param only receives raw
            # pointers into their buffers.
            self.param.nr_weight = len(weights)
            self.param.weight_label = \
                self.weight_labels.ctypes.data_as(POINTER(c_int))
            self.param.weight = \
                self.weights.ctypes.data_as(POINTER(c_double))

    def cross_validate(self, dataset, nr_fold):
        """
        Perform stratified cross-validation to determine the
        suitability of chosen model parameters.

        Data are separated to nr_fold folds. Each fold is validated
        against a model trained using the data from the remaining
        (nr_fold-1) folds.

        This function returns the percentage of data that was
        classified correctly over all the experiments.
        """
        problem = dataset._create_svm_problem()
        # libsvm writes one predicted target per data item into this
        # buffer.
        target = N.empty((len(dataset.data),), dtype=N.float64)
        tp = target.ctypes.data_as(POINTER(c_double))
        libsvm.svm_cross_validation(problem, self.param, nr_fold, tp)
        # An item counts as correct when its first field (assumed to
        # be its label -- confirm against the dataset classes) equals
        # the predicted target truncated to an int.
        total_correct = sum(1 for x, t in zip(dataset.data, target)
                            if x[0] == int(t))
        # XXX also return results from folds in a list
        return 100.0 * total_correct / len(dataset.data)

class LibSvmCClassificationModel(LibSvmClassificationModel):
    """
    Support vector classification using the C-SVC formulation.

    See also:

    - Hsu, et al. A Practical Guide to Support Vector Classification.
    - Gunn. Support Vector Machines for Classification and Regression.
    - Burges. A Tutorial on Support Vector Machines for Pattern
      Recognition.
    """

    def __init__(self, kernel,
                 cost=1.0, weights=None, probability=False, **kwargs):
        """
        Parameters:

        - `kernel`: forwarded to the base class constructor.
        - `cost`: kept as `self.cost` and written to the libsvm
          parameter `C`.
        - `weights`: forwarded to the base class constructor.
        - `probability`: written to the libsvm parameter
          `probability`.
        - `kwargs`: forwarded to the base class constructor.
        """
        LibSvmClassificationModel.__init__(self, kernel, weights, **kwargs)
        params = self.param
        params.svm_type = libsvm.C_SVC
        params.C = cost
        params.probability = probability
        self.cost = cost

class LibSvmNuClassificationModel(LibSvmClassificationModel):
    """
    Support vector classification using the nu-SVC formulation.

    See also:

    - Chen, et al. A Tutorial on nu-Support Vector Machines.
    - Scholkopf, et al. New Support Vector Algorithms.
    """

    def __init__(self, kernel,
                 nu=0.5, weights=None, probability=False, **kwargs):
        """
        Parameters:

        - `kernel`: forwarded to the base class constructor.
        - `nu`: kept as `self.nu` and written to the libsvm parameter
          `nu`.
        - `weights`: forwarded to the base class constructor.
        - `probability`: written to the libsvm parameter
          `probability`.
        - `kwargs`: forwarded to the base class constructor.
        """
        LibSvmClassificationModel.__init__(self, kernel, weights, **kwargs)
        params = self.param
        params.svm_type = libsvm.NU_SVC
        params.nu = nu
        params.probability = probability
        self.nu = nu