File: test_txt_reader.py

package info (click to toggle)
orange3 3.40.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,908 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (135 lines) | stat: -rw-r--r-- 3,545 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
import unittest
from tempfile import NamedTemporaryFile
import os
import warnings

from Orange.data import Table, ContinuousVariable, DiscreteVariable
from Orange.data.io import CSVReader
from Orange.tests import test_filename, named_file

# Tab-separated fixture with a header row; the cells are padded with spaces
# before the tab separators.
tab_file = """\
Feature 1\tFeature 2\tFeature 3
1.0      \t1.3        \t5
2.0      \t42        \t7
"""

# Comma-separated fixture with a header row; the cells are padded with spaces
# after the commas.
csv_file = """\
Feature 1,   Feature 2,Feature 3
1.0,      1.3,       5
2.0,      42,        7
"""

# Same data rows as `tab_file`, but without the header row.
tab_file_nh = """\
1.0      \t1.3        \t5
2.0      \t42        \t7
"""

# Same data rows as `csv_file`, but without the header row.
csv_file_nh = """\
1.0,      1.3,       5
2.0,      42,        7
"""

# Input that the reader is expected to reject with a ValueError mentioning
# 'line 5, column 2' (see test_noncontinous_marked_continuous).
# NOTE(review): presumably lines 1-3 form a names/types/flags header in which
# the second column is typed 'c' (continuous), so the 'g' on line 5 is the
# offending value -- confirm against the reader's header format.
noncont_marked_cont = '''\
a,b
d,c
,
e,1
f,g
'''


# Two-column comma-separated fixture in which '?' appears in place of a value
# once in each column; used by test_read_csv_with_na.
csv_file_missing = """\
A,B
1,A
2,B
3,A
?,B
5,?
"""


class TestTabReader(unittest.TestCase):
    """Exercise ``CSVReader`` on small tab- and comma-separated files.

    Tests that need a file on disk create it with
    ``NamedTemporaryFile(..., delete=False)`` so that it can be closed and
    then reopened by name. Every such test closes the file with a ``with``
    block and removes it in a ``finally`` clause, so the file is cleaned up
    even when the write, the read or an assertion fails.
    """

    def read_easy(self, s, name):
        """Read `s` through ``CSVReader`` and check the three variables.

        Args:
            s: file content that is written to a temporary file and read
                back.
            name: expected variable-name prefix; the variables must be
                named ``name + "1"``, ``name + "2"`` and ``name + "3"``.

        The first variable must be a ``DiscreteVariable`` and the other two
        ``ContinuousVariable`` instances.
        """
        file = NamedTemporaryFile("wt", delete=False)
        filename = file.name
        try:
            # Close the handle (even if the write fails) before the file is
            # read back or removed.
            with file:
                file.write(s)
            table = CSVReader(filename).read()

            f1, f2, f3 = table.domain.variables
            self.assertIsInstance(f1, DiscreteVariable)
            self.assertEqual(f1.name, name + "1")
            self.assertIsInstance(f2, ContinuousVariable)
            self.assertEqual(f2.name, name + "2")
            self.assertIsInstance(f3, ContinuousVariable)
            self.assertEqual(f3.name, name + "3")
        finally:
            os.remove(filename)

    def test_read_tab(self):
        """Tab-separated input, with and without a header row."""
        self.read_easy(tab_file, "Feature ")
        self.read_easy(tab_file_nh, "Feature ")

    def test_read_csv(self):
        """Comma-separated input, with and without a header row."""
        self.read_easy(csv_file, "Feature ")
        self.read_easy(csv_file_nh, "Feature ")

    def test_read_csv_with_na(self):
        """Columns containing '?' still get the expected variable types."""
        tmp = NamedTemporaryFile(mode="w", delete=False)
        try:
            with tmp:
                tmp.write(csv_file_missing)
            table = CSVReader(tmp.name).read()
        finally:
            # Remove the file even when reading raises.
            os.unlink(tmp.name)

        f1, f2 = table.domain.variables
        self.assertIsInstance(f1, ContinuousVariable)
        self.assertIsInstance(f2, DiscreteVariable)

    def test_read_nonutf8_encoding(self):
        """Loading 'invalid_characters.tab' raises ValueError."""
        with self.assertRaises(ValueError):
            with warnings.catch_warnings():
                # Turn every warning into an exception while loading.
                warnings.filterwarnings('error')
                Table(test_filename('datasets/invalid_characters.tab'))

    def test_noncontinous_marked_continuous(self):
        """The reader's ValueError reports the offending line and column."""
        file = NamedTemporaryFile("wt", delete=False)
        filename = file.name
        try:
            with file:
                file.write(noncont_marked_cont)
            with self.assertRaises(ValueError) as cm:
                CSVReader(filename).read()
        finally:
            # The file was created with delete=False, so it has to be
            # removed explicitly.
            os.remove(filename)
        self.assertIn('line 5, column 2', cm.exception.args[0])

    def test_pr1734(self):
        """Reading a 'foo' column typed 'time' works after creating 'foo'.

        The test passes as long as reading raises no exception.
        """
        # Create a continuous variable named 'foo' before reading a file
        # whose only column is also named 'foo' but is typed 'time'.
        ContinuousVariable('foo')
        file = NamedTemporaryFile("wt", delete=False)
        filename = file.name
        try:
            with file:
                file.write('''\
foo
time

123123123
''')
            CSVReader(filename).read()
        finally:
            os.remove(filename)

    def test_csv_sniffer(self):
        """The dataset from GH-2785 is read with 8 rows and 15 columns."""
        # GH-2785
        reader = CSVReader(test_filename('datasets/test_asn_data_working.csv'))
        data = reader.read()
        self.assertEqual(len(data), 8)
        self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)

    def test_utf_8_sig(self):
        """A file written as 'utf-8-sig' yields a clean first column name."""
        with named_file(csv_file, encoding="utf-8-sig") as f:
            reader = CSVReader(f)
            data = reader.read()
            self.assertEqual(data.domain[0].name, "Feature 1")


if __name__ == "__main__":
    unittest.main()