File: checkdata.py

package info (click to toggle)
libsvm 3.25%2Bds-1~exp1.1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 968 kB
  • sloc: java: 3,680; cpp: 3,146; ansic: 2,253; python: 1,359; makefile: 152; sh: 41
file content (108 lines) | stat: -rwxr-xr-x 3,074 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python

#
# A format checker for LIBSVM
#

#
# Copyright (c) 2007, Rong-En Fan
#
# All rights reserved.
#
# This program is distributed under the same license of the LIBSVM package.
#

from sys import argv, exit
import os.path

def err(line_no, msg):
    """Print a diagnostic for one input line as ``line <line_no>: <msg>``."""
    report = "line {0}: {1}".format(line_no, msg)
    print(report)

# works like float() but does not accept nan and inf
def my_float(x):
    """Convert the string *x* to a float, rejecting NaN and infinity spellings.

    Raises ValueError when the lower-cased text contains "nan" or "inf",
    or when float() itself cannot parse the text.
    """
    lowered = x.lower()
    for banned in ("nan", "inf"):
        if banned in lowered:
            raise ValueError

    return float(x)

def main():
    """Check that the dataset named on the command line is in LIBSVM format.

    Exactly one command-line argument (the dataset path) is expected.
    Every line must end with a newline character and consist of a label
    (one number, or a comma-separated list of numbers) followed by
    ``<index>:<value>`` features whose indices are non-negative integers
    in strictly ascending order and whose values are real numbers.  Each
    problem found is reported through err().

    Returns:
        0 if no line contains an error, 1 otherwise.

    Raises:
        SystemExit: with status 1 on wrong usage or when the dataset path
            does not exist.
    """
    if len(argv) != 2:
        print("Usage: {0} dataset".format(argv[0]))
        exit(1)

    dataset = argv[1]

    if not os.path.exists(dataset):
        print("dataset {0} not found".format(dataset))
        exit(1)

    error_line_count = 0
    # The with-block guarantees the file is closed, even if a check raises.
    with open(dataset, 'r') as data_file:
        for line_no, line in enumerate(data_file, 1):
            line_error = False

            # each line must end with a newline character
            if not line.endswith('\n'):
                err(line_no, "missing a newline character in the end")
                line_error = True

            nodes = line.split()

            # check label
            if not nodes:
                err(line_no, "missing label, perhaps an empty line?")
                line_error = True
            else:
                label = nodes.pop(0)

                if ',' in label:
                    # multi-label format: every comma-separated part must be a number
                    try:
                        for part in label.split(','):
                            my_float(part)
                    except ValueError:
                        err(line_no, "label {0} is not a valid multi-label form".format(label))
                        line_error = True
                else:
                    try:
                        my_float(label)
                    except ValueError:
                        err(line_no, "label {0} is not a number".format(label))
                        line_error = True

            # check features
            prev_index = -1
            # The feature that set prev_index, so the ordering message shows
            # the pair that was actually compared rather than whatever token
            # happens to precede the current one (which may be malformed).
            prev_node = None
            for node in nodes:
                try:
                    # A wrong number of ':'-separated parts, a non-integer
                    # index and a non-numeric value all raise ValueError.
                    index_text, value_text = node.split(':')
                    index = int(index_text)
                    my_float(value_text)
                except ValueError:
                    err(line_no, "feature '{0}' not an <index>:<value> pair, <index> integer, <value> real number ".format(node))
                    line_error = True
                    continue

                # precomputed kernel's index starts from 0 and LIBSVM
                # checks it. Hence, don't treat index 0 as an error.
                if index < 0:
                    err(line_no, "feature index must be positive; wrong feature {0}".format(node))
                    line_error = True
                elif index <= prev_index:
                    err(line_no, "feature indices must be in an ascending order, previous/current features {0} {1}".format(prev_node, node))
                    line_error = True
                prev_index = index
                prev_node = node

            if line_error:
                error_line_count += 1

    if error_line_count > 0:
        print("Found {0} lines with error.".format(error_line_count))
        return 1

    print("No error.")
    return 0

# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    status = main()
    exit(status)