File: test_io.py

package info (click to toggle)
orange3 3.40.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,908 kB
  • sloc: python: 162,745; ansic: 622; makefile: 322; sh: 93; cpp: 77
file content (160 lines) | stat: -rw-r--r-- 6,591 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import unittest
from tempfile import NamedTemporaryFile

import numpy as np

from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
    TimeVariable, Domain, Table
from Orange.data.io import TabReader, ExcelReader
from Orange.data.io_util import guess_data_type
from Orange.misc.collections import natural_sorted


class TestTableFilters(unittest.TestCase):
    def test_guess_data_type_continuous(self):
        # should be ContinuousVariable
        valuemap, values, coltype = guess_data_type(list(range(1, 100)))
        self.assertEqual(ContinuousVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(np.array(list(range(1, 100))), values)

        valuemap, values, coltype = guess_data_type([1, 2, 3, 1, 2, 3])
        self.assertEqual(ContinuousVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal([1, 2, 3, 1, 2, 3], values)

        valuemap, values, coltype = guess_data_type(
            ["1", "2", "3", "1", "2", "3"])
        self.assertEqual(ContinuousVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal([1, 2, 3, 1, 2, 3], values)

    def test_guess_data_type_discrete(self):
        # should be DiscreteVariable
        valuemap, values, coltype = guess_data_type([1, 2, 1, 2])
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual([1, 2], valuemap)
        np.testing.assert_array_equal([1, 2, 1, 2], values)

        valuemap, values, coltype = guess_data_type(["1", "2", "1", "2", "a"])
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual(["1", "2", "a"], valuemap)
        np.testing.assert_array_equal(['1', '2', '1', '2', 'a'], values)

        # just below the threshold for string variable
        in_values = list(map(lambda x: str(x) + "a", range(24))) + ["a"] * 76
        valuemap, values, coltype = guess_data_type(in_values)
        self.assertEqual(DiscreteVariable, coltype)
        self.assertEqual(natural_sorted(set(in_values)), valuemap)
        np.testing.assert_array_equal(in_values, values)

    def test_guess_data_type_string(self):
        # should be StringVariable
        # too many different values for discrete
        in_values = list(map(lambda x: str(x) + "a", range(90)))
        valuemap, values, coltype = guess_data_type(in_values)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(in_values, values)

        # more than len(values)**0.7
        in_values = list(map(lambda x: str(x) + "a", range(25))) + ["a"] * 75
        valuemap, values, coltype = guess_data_type(in_values)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(in_values, values)

        # more than 100 different values - exactly 101
        # this is the case when len(values)**0.7 rule would vote for the
        # DiscreteVariable
        in_values = list(map(lambda x: str(x) + "a", range(100))) + ["a"] * 999
        valuemap, values, coltype = guess_data_type(in_values)
        self.assertEqual(StringVariable, coltype)
        self.assertIsNone(valuemap)
        np.testing.assert_array_equal(in_values, values)

    def test_guess_data_type_time(self):
        in_values = ["2019-10-10", "2019-10-10", "2019-10-10", "2019-10-01"]
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)

        in_values = ["2019-10-10T12:08:51", "2019-10-10T12:08:51",
                     "2019-10-10T12:08:51", "2019-10-01T12:08:51"]
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)

        in_values = ["2019-10-10 12:08:51", "2019-10-10 12:08:51",
                     "2019-10-10 12:08:51", "2019-10-01 12:08:51"]
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)

        in_values = ["2019-10-10 12:08", "2019-10-10 12:08",
                     "2019-10-10 12:08", "2019-10-01 12:08"]
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(TimeVariable, coltype)
        self.assertIsNone(valuemap)

    def test_guess_data_type_values_order(self):
        """
        Test if values are ordered naturally
        """
        in_values = [
            "something1", "something12", "something2", "something1",
            "something20", "something1", "something2", "something12",
            "something1", "something12"
        ]
        res = ["something1", "something2", "something12", "something20"]
        valuemap, _, coltype = guess_data_type(in_values)
        self.assertEqual(DiscreteVariable, coltype)
        self.assertListEqual(res, valuemap)

class TestWriters(unittest.TestCase):
    def setUp(self):
        self.domain = Domain([DiscreteVariable("a", values=tuple("xyz")),
                         ContinuousVariable("b", number_of_decimals=3)],
                        ContinuousVariable("c", number_of_decimals=0),
                        [StringVariable("d")])
        self.data = Table.from_numpy(
            self.domain,
            np.array([[1, 0.5], [2, np.nan], [np.nan, 1.0625]]),
            np.array([3, 1, 7]),
            np.array([["foo", "bar", np.nan]], dtype=object).T
        )

    def test_write_tab(self):
        with NamedTemporaryFile(suffix=".tab", delete=False) as f:
            fname = f.name
        try:
            TabReader.write(fname, self.data)
            with open(fname, encoding="utf-8") as f:
                self.assertEqual(f.read().strip(), """
c\td\ta\tb
continuous\tstring\tx y z\tcontinuous
class\tmeta\t\t
3\tfoo\ty\t0.500
1\tbar\tz\t
7\t\t\t1.06250""".strip())
        finally:
            os.remove(fname)

    def test_roundtrip_xlsx(self):
        with NamedTemporaryFile(suffix=".xlsx", delete=False) as f:
            fname = f.name
        try:
            ExcelReader.write(fname, self.data)
            data = ExcelReader(fname).read()
            np.testing.assert_equal(data.X, self.data.X)
            np.testing.assert_equal(data.Y, self.data.Y)
            np.testing.assert_equal(data.metas[:2], self.data.metas[:2])
            self.assertEqual(data.metas[2, 0], "")
            np.testing.assert_equal(data.domain, self.data.domain)
        finally:
            os.remove(fname)


if __name__ == "__main__":
    unittest.main()