File: type_tester.py

package info (click to toggle)
python-agate 1.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 2,008 kB
  • sloc: python: 8,578; makefile: 126
file content (131 lines) | stat: -rw-r--r-- 4,789 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import warnings
from copy import copy

from agate.data_types.base import DEFAULT_NULL_VALUES
from agate.data_types.boolean import Boolean
from agate.data_types.date import Date
from agate.data_types.date_time import DateTime
from agate.data_types.number import Number
from agate.data_types.text import Text
from agate.data_types.time_delta import TimeDelta


class TypeTester:
    """
    Control how data types are inferred for columns in a given set of data.

    This class is used by passing it to the :code:`column_types` argument of
    the :class:`.Table` constructor, or the same argument for any other method
    that creates a :class:`.Table`.

    Type inference can be a slow process. To limit the number of rows of data to
    be tested, pass the :code:`limit` argument. Note that this may cause errors
    if your data contains different types of values after the specified number
    of rows.

    By default, data types will be tested against each column in this order:

    1. :class:`.Boolean`
    2. :class:`.Number`
    3. :class:`.TimeDelta`
    4. :class:`.Date`
    5. :class:`.DateTime`
    6. :class:`.Text`

    Individual types may be specified using the :code:`force` argument. The type
    order may be changed, or entire types disabled, by using the :code:`types`
    argument. Beware that changing the order of the types may cause unexpected
    behavior.

    :param force:
        A dictionary where each key is a column name and each value is a
        :class:`.DataType` instance that overrides inference. Defaults to no
        forced columns.
    :param limit:
        An optional limit on how many rows to evaluate before selecting the
        most likely type. Note that applying a limit may mean errors arise when
        the data is cast--if the guess is proved incorrect in further rows of
        data. A limit of :code:`0` skips inference entirely.
    :param types:
        A sequence of possible types to test against. This can be used to
        specify what data formats you want to test against. For instance, you
        may want to exclude :class:`.TimeDelta` from testing. It can also be
        used to pass options such as ``locale`` to :class:`.Number` or
        ``cast_nulls`` to :class:`.Text`. Take care in specifying the order of
        the list. It is the order they are tested in. :class:`.Text` should
        always be last. If omitted (or empty), the default types are used.
    :param null_values:
        If :code:`types` is :code:`None`, a sequence of values which should be
        cast to :code:`None` when encountered by the default data types.
    """
    def __init__(self, force=None, limit=None, types=None, null_values=DEFAULT_NULL_VALUES):
        # Use a None sentinel rather than a ``{}`` default so that instances
        # never share one module-level dictionary. A dictionary supplied by the
        # caller is still stored by reference, exactly as before.
        self._force = {} if force is None else force
        self._limit = limit

        if types:
            self._possible_types = types
        else:
            # In order of preference
            self._possible_types = [
                Boolean(null_values=null_values),
                Number(null_values=null_values),
                TimeDelta(null_values=null_values),
                Date(null_values=null_values),
                DateTime(null_values=null_values),
                Text(null_values=null_values),
            ]

    def run(self, rows, column_names):
        """
        Apply type inference to the provided data and return an array of
        column types.

        :param rows:
            The data as a sequence of any sequences: tuples, lists, etc.
        :param column_names:
            A sequence of column names, in column order. Names found in
            :code:`force` are matched against this sequence; a forced name
            that is not present triggers a :class:`RuntimeWarning`.
        :returns:
            A tuple of data type instances, one per column for which a type
            was forced or inferred.
        """
        num_columns = len(column_names)
        # Every column starts out compatible with every candidate type;
        # candidates are struck off as values fail their test.
        hypotheses = [set(self._possible_types) for _ in range(num_columns)]
        # A set keeps the per-cell membership checks below constant-time.
        forced_indices = set()

        for name in self._force:
            try:
                forced_indices.add(column_names.index(name))
            except ValueError:
                warnings.warn('"%s" does not match the name of any column in this table.' % name, RuntimeWarning)

        if self._limit:
            sample_rows = rows[:self._limit]
        elif self._limit == 0:
            # A limit of zero means "do not infer": every column is reported
            # as a default Text type. Note that this path also bypasses any
            # forced types.
            text = Text()
            return tuple([text] * num_columns)
        else:
            sample_rows = rows

        for row in sample_rows:
            for i in range(num_columns):
                if i in forced_indices:
                    continue

                h = hypotheses[i]

                # Only one candidate left: nothing further to eliminate.
                if len(h) == 1:
                    continue

                # Rows shorter than the header have no value for this column
                # and therefore cannot rule any type out.
                if len(row) <= i:
                    continue

                value = row[i]

                # Iterate over a copy because failing types are removed from
                # the live set.
                for column_type in copy(h):
                    if not column_type.test(value):
                        h.remove(column_type)

        column_types = []

        for i, name in enumerate(column_names):
            if i in forced_indices:
                column_types.append(self._force[name])
                continue

            h = hypotheses[i]

            # Select in preferred order.
            # NOTE: if no candidate survived for this column (possible when
            # the configured types do not include a catch-all such as Text),
            # nothing is appended for it and the result is shorter than
            # column_names.
            for t in self._possible_types:
                if t in h:
                    column_types.append(t)
                    break

        return tuple(column_types)