# Unit tests for Orange.data.io: data-type guessing and the tab/Excel writers.
import os
import unittest
from tempfile import NamedTemporaryFile
import numpy as np
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
TimeVariable, Domain, Table
from Orange.data.io import TabReader, ExcelReader
from Orange.data.io_util import guess_data_type
from Orange.misc.collections import natural_sorted
class TestTableFilters(unittest.TestCase):
    """Exercise guess_data_type on numeric, categorical, text and time data."""

    def test_guess_data_type_continuous(self):
        """Integer and numeric-string columns are typed ContinuousVariable."""
        # Each pair is (raw column, values expected back). The expected values
        # are numeric even when the raw column holds strings.
        samples = [
            (list(range(1, 100)), np.array(list(range(1, 100)))),
            ([1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]),
            (["1", "2", "3", "1", "2", "3"], [1, 2, 3, 1, 2, 3]),
        ]
        for raw, expected in samples:
            mapping, parsed, kind = guess_data_type(raw)
            self.assertEqual(ContinuousVariable, kind)
            # Continuous columns carry no value map.
            self.assertIsNone(mapping)
            np.testing.assert_array_equal(expected, parsed)

    def test_guess_data_type_discrete(self):
        """Columns expected to be typed DiscreteVariable, with their value map."""
        # Integer column with two distinct values.
        mapping, parsed, kind = guess_data_type([1, 2, 1, 2])
        self.assertEqual(DiscreteVariable, kind)
        self.assertEqual([1, 2], mapping)
        np.testing.assert_array_equal([1, 2, 1, 2], parsed)

        # Mixed numeric and non-numeric strings stay strings.
        mapping, parsed, kind = guess_data_type(["1", "2", "1", "2", "a"])
        self.assertEqual(DiscreteVariable, kind)
        self.assertEqual(["1", "2", "a"], mapping)
        np.testing.assert_array_equal(['1', '2', '1', '2', 'a'], parsed)

        # Just below the threshold for a string variable.
        column = [str(i) + "a" for i in range(24)] + ["a"] * 76
        mapping, parsed, kind = guess_data_type(column)
        self.assertEqual(DiscreteVariable, kind)
        self.assertEqual(natural_sorted(set(column)), mapping)
        np.testing.assert_array_equal(column, parsed)

    def test_guess_data_type_string(self):
        """Columns with too many distinct values are typed StringVariable."""
        samples = [
            # Too many different values for a discrete variable.
            [str(i) + "a" for i in range(90)],
            # More distinct values than len(values)**0.7.
            [str(i) + "a" for i in range(25)] + ["a"] * 75,
            # More than 100 different values - exactly 101. This is the case
            # where the len(values)**0.7 rule alone would vote for
            # DiscreteVariable.
            [str(i) + "a" for i in range(100)] + ["a"] * 999,
        ]
        for column in samples:
            mapping, parsed, kind = guess_data_type(column)
            self.assertEqual(StringVariable, kind)
            self.assertIsNone(mapping)
            # The values come back unchanged.
            np.testing.assert_array_equal(column, parsed)

    def test_guess_data_type_time(self):
        """Date and date-time strings are typed TimeVariable."""
        samples = [
            # Date only.
            ["2019-10-10", "2019-10-10", "2019-10-10", "2019-10-01"],
            # Date and time separated by "T".
            ["2019-10-10T12:08:51", "2019-10-10T12:08:51",
             "2019-10-10T12:08:51", "2019-10-01T12:08:51"],
            # Date and time separated by a space.
            ["2019-10-10 12:08:51", "2019-10-10 12:08:51",
             "2019-10-10 12:08:51", "2019-10-01 12:08:51"],
            # Time given without seconds.
            ["2019-10-10 12:08", "2019-10-10 12:08",
             "2019-10-10 12:08", "2019-10-01 12:08"],
        ]
        for column in samples:
            mapping, _, kind = guess_data_type(column)
            self.assertEqual(TimeVariable, kind)
            self.assertIsNone(mapping)

    def test_guess_data_type_values_order(self):
        """The value map of a discrete column is in natural sort order."""
        column = [
            "something1", "something12", "something2", "something1",
            "something20", "something1", "something2", "something12",
            "something1", "something12"
        ]
        # Natural order: the numeric suffix compares as a number (2 < 12).
        expected_order = [
            "something1", "something2", "something12", "something20"
        ]
        mapping, _, kind = guess_data_type(column)
        self.assertEqual(DiscreteVariable, kind)
        self.assertListEqual(expected_order, mapping)
class TestWriters(unittest.TestCase):
    """Write a small table with TabReader / ExcelReader and check the result."""

    @staticmethod
    def _reserve_path(suffix):
        """Create an empty temporary file ending in *suffix*; return its path.

        The file is kept on disk (``delete=False``); the caller removes it.
        """
        with NamedTemporaryFile(suffix=suffix, delete=False) as handle:
            return handle.name

    def setUp(self):
        """Build a three-row table with attributes, a class and a string meta.

        Both attribute columns and the meta column contain a missing value.
        """
        attributes = [
            DiscreteVariable("a", values=tuple("xyz")),
            ContinuousVariable("b", number_of_decimals=3),
        ]
        class_var = ContinuousVariable("c", number_of_decimals=0)
        metas = [StringVariable("d")]
        self.domain = Domain(attributes, class_var, metas)

        x_part = np.array([[1, 0.5], [2, np.nan], [np.nan, 1.0625]])
        y_part = np.array([3, 1, 7])
        # Transposed so the metas form a single column with three rows.
        meta_part = np.array([["foo", "bar", np.nan]], dtype=object).T
        self.data = Table.from_numpy(self.domain, x_part, y_part, meta_part)

    def test_write_tab(self):
        """TabReader.write produces the expected header and data rows."""
        # Compared after stripping surrounding whitespace on both sides.
        expected = """
c\td\ta\tb
continuous\tstring\tx y z\tcontinuous
class\tmeta\t\t
3\tfoo\ty\t0.500
1\tbar\tz\t
7\t\t\t1.06250""".strip()
        path = self._reserve_path(".tab")
        try:
            TabReader.write(path, self.data)
            with open(path, encoding="utf-8") as written:
                self.assertEqual(written.read().strip(), expected)
        finally:
            os.remove(path)

    def test_roundtrip_xlsx(self):
        """A table written by ExcelReader reads back with the same content."""
        path = self._reserve_path(".xlsx")
        try:
            ExcelReader.write(path, self.data)
            loaded = ExcelReader(path).read()
            np.testing.assert_equal(loaded.X, self.data.X)
            np.testing.assert_equal(loaded.Y, self.data.Y)
            # The first two metas match; the missing one is expected to read
            # back as an empty string.
            np.testing.assert_equal(loaded.metas[:2], self.data.metas[:2])
            self.assertEqual(loaded.metas[2, 0], "")
            np.testing.assert_equal(loaded.domain, self.data.domain)
        finally:
            os.remove(path)
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|