File: __init__.py

package info (click to toggle)
python-agate 1.9.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,996 kB
  • sloc: python: 8,512; makefile: 126
file content (359 lines) | stat: -rw-r--r-- 12,974 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
"""
The :class:`.Table` object is the most important class in agate. Tables are
created by supplying row data, column names and subclasses of :class:`.DataType`
to the constructor. Once created, the data in a table **can not be changed**.
This concept is central to agate.

Instead of modifying the data, various methods can be used to create new,
derivative tables. For example, the :meth:`.Table.select` method creates a new
table with only the specified columns. The :meth:`.Table.where` method creates
a new table with only those rows that pass a test. And :meth:`.Table.order_by`
creates a sorted table. In all of these cases the output is a new :class:`.Table`
and the existing table remains unmodified.

Iterating over a table, taking its length or indexing it is shorthand for the
same operation on its rows. The columns of the table can be accessed via
:attr:`.Table.columns` and the rows via :attr:`.Table.rows`. Both sequences
can be accessed either by numeric index or by name. (In the case of rows, row
names are optional.)
"""

import sys
import warnings
from io import StringIO
from itertools import chain

from agate import utils
from agate.columns import Column
from agate.data_types import DataType
from agate.exceptions import CastError
from agate.mapped_sequence import MappedSequence
from agate.rows import Row
from agate.type_tester import TypeTester


class Table:
    """
    A dataset consisting of rows and columns. Columns refer to "vertical" slices
    of data that must all be of the same type. Rows refer to "horizontal" slices
    of data that may (and usually do) contain mixed types.

    The sequence of :class:`.Column` instances are retrieved via the
    :attr:`.Table.columns` property. They may be accessed by either numeric
    index or by unique column name.

    The sequence of :class:`.Row` instances are retrieved via the
    :attr:`.Table.rows` property. They may be accessed by either numeric index
    or, if specified, unique row names.

    :param rows:
        The data as a sequence of any sequences: tuples, lists, etc. If
        any row has fewer values than the number of columns, it will be filled
        out with nulls. No row may have more values than the number of columns.
    :param column_names:
        A sequence of string names for each column or `None`, in which case
        column names will be automatically assigned using :func:`.letter_name`.
    :param column_types:
        A sequence of instances of :class:`.DataType` or an instance of
        :class:`.TypeTester` or `None` in which case a generic TypeTester will
        be used. Alternatively, a dictionary with column names as keys and
        instances of :class:`.DataType` as values to specify some types.
    :param row_names:
        Specifies unique names for each row. This parameter is
        optional. If specified it may be 1) the name of a single column that
        contains a unique identifier for each row, 2) a key function that takes
        a :class:`.Row` and returns a unique identifier or 3) a sequence of
        unique identifiers of the same length as the sequence of rows. The
        uniqueness of resulting identifiers is not validated, so be certain
        the values you provide are truly unique. Row names may not be of type
        :code:`int`.
    :param _is_fork:
        Used internally to skip certain validation steps when data
        is propagated from an existing table. When :code:`True`, rows are
        assumed to be :class:`.Row` instances, rather than raw data, and are
        used as given without casting or length checks.
    :raises ValueError:
        If :code:`rows` is a string, if a column type is not a
        :class:`.DataType`, if the number of column names and column types
        differ, if a row has more values than there are columns, or if
        :code:`row_names` is of an unsupported kind or produces an
        :code:`int`.
    :raises CastError:
        If a value can not be cast by its column's type. The message of the
        original error is extended with the row index and the column name.
    """
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        # A string is iterable, so without this guard it would be treated as
        # a sequence of one-character "rows".
        if isinstance(rows, str):
            raise ValueError('When created directly, the first argument to Table must be a sequence of rows. '
                             'Did you want agate.Table.from_csv?')

        # Validate column names
        if column_names:
            self._column_names = utils.deduplicate(column_names, column_names=True)
        elif rows:
            # No names given: generate one per value in the first row.
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(f'Column names not specified. "{self._column_names!s}" will be used as names.',
                          RuntimeWarning, stacklevel=2)
        else:
            self._column_names = ()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(v, DataType) for v in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            # A dictionary only pins some columns; the tester handles the rest.
            column_types = TypeTester(force=column_types)

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            # Materialize once *before* validating, so that a one-shot
            # iterable is not exhausted by the validation pass.
            column_types = tuple(column_types)

            if not all(isinstance(column_type, DataType) for column_type in column_types):
                raise ValueError('Column types must be instances of DataType.')

            self._column_types = column_types

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        f'Row {i:d} has {len_row:d} values, but Table only has {len_column_names:d} columns.'
                    )
                elif len_row < len_column_names:
                    # Pad short rows with nulls up to the table's width.
                    row = chain(row, [None] * (len_column_names - len_row))

                row_values = []
                for j, d in enumerate(row):
                    try:
                        row_values.append(cast_funcs[j](d))
                    except CastError as e:
                        # Re-raise with the location of the offending value,
                        # keeping the original error as the explicit cause.
                        raise CastError(str(e) + f' Error at row {i} column {self._column_names[j]}.') from e

                new_rows.append(Row(row_values, self._column_names))
        else:
            # Forked tables reuse the rows they are handed, unchanged.
            new_rows = rows

        if row_names:
            if isinstance(row_names, str):
                # A column name: each row is named by its value in that column.
                computed_row_names = [row[row_names] for row in new_rows]
            elif callable(row_names):
                # A key function: each row is named by the function's result.
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            for row_name in computed_row_names:
                # Exact type check: only plain ints are rejected.
                # NOTE(review): presumably because an int key would be
                # ambiguous with a positional index - confirm against
                # MappedSequence.
                if type(row_name) is int:
                    raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = [
            Column(i, name, data_type, self._rows, row_names=self._row_names)
            for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types))
        ]

        self._columns = MappedSequence(new_columns, self._column_names)

    def __str__(self):
        """
        Print the table's structure using :meth:`.Table.print_structure`.
        """
        structure = StringIO()

        self.print_structure(output=structure)

        return structure.getvalue()

    def __len__(self):
        """
        Shorthand for :code:`len(table.rows)`.
        """
        return len(self._rows)

    def __iter__(self):
        """
        Shorthand for :code:`iter(table.rows)`.
        """
        return iter(self._rows)

    def __getitem__(self, key):
        """
        Shorthand for :code:`table.rows[foo]`.
        """
        return self._rows[key]

    @property
    def column_types(self):
        """
        A tuple of :class:`.DataType` instances.
        """
        return self._column_types

    @property
    def column_names(self):
        """
        A tuple of strings.
        """
        return self._column_names

    @property
    def row_names(self):
        """
        A tuple of row names, if this table has row names.

        If this table does not have row names, then :code:`None`.
        """
        return self._row_names

    @property
    def columns(self):
        """
        A :class:`.MappedSequence` with column names for keys and
        :class:`.Column` instances for values.
        """
        return self._columns

    @property
    def rows(self):
        """
        A :class:`.MappedSequence` with row names for keys (if specified) and
        :class:`.Row` instances for values.
        """
        return self._rows

    def _fork(self, rows, column_names=None, column_types=None, row_names=None):
        """
        Create a new table using the metadata from this one.

        This method is used internally by functions like
        :meth:`.Table.order_by`.

        :param rows:
            Row data for the forked table.
        :param column_names:
            Column names for the forked table. If not specified, fork will use
            this table's column names.
        :param column_types:
            Column types for the forked table. If not specified, fork will use
            this table's column types.
        :param row_names:
            Row names for the forked table. If not specified, fork will use
            this table's row names.
        :returns:
            A new :class:`.Table` constructed with :code:`_is_fork=True`.
        """
        if column_names is None:
            column_names = self._column_names

        if column_types is None:
            column_types = self._column_types

        if row_names is None:
            row_names = self._row_names

        return Table(rows, column_names, column_types, row_names=row_names, _is_fork=True)

    def print_csv(self, **kwargs):
        """
        Print this table as a CSV.

        This is the same as passing :code:`sys.stdout` to :meth:`.Table.to_csv`.

        :code:`kwargs` will be passed on to :meth:`.Table.to_csv`.
        """
        self.to_csv(sys.stdout, **kwargs)

    def print_json(self, **kwargs):
        """
        Print this table as JSON.

        This is the same as passing :code:`sys.stdout` to
        :meth:`.Table.to_json`.

        :code:`kwargs` will be passed on to :meth:`.Table.to_json`.
        """
        self.to_json(sys.stdout, **kwargs)


from agate.table.aggregate import aggregate
from agate.table.bar_chart import bar_chart
from agate.table.bins import bins
from agate.table.column_chart import column_chart
from agate.table.compute import compute
from agate.table.denormalize import denormalize
from agate.table.distinct import distinct
from agate.table.exclude import exclude
from agate.table.find import find
from agate.table.from_csv import from_csv
from agate.table.from_fixed import from_fixed
from agate.table.from_json import from_json
from agate.table.from_object import from_object
from agate.table.group_by import group_by
from agate.table.homogenize import homogenize
from agate.table.join import join
from agate.table.limit import limit
from agate.table.line_chart import line_chart
from agate.table.merge import merge
from agate.table.normalize import normalize
from agate.table.order_by import order_by
from agate.table.pivot import pivot
from agate.table.print_bars import print_bars
from agate.table.print_html import print_html
from agate.table.print_structure import print_structure
from agate.table.print_table import print_table
from agate.table.rename import rename
from agate.table.scatterplot import scatterplot
from agate.table.select import select
from agate.table.to_csv import to_csv
from agate.table.to_json import to_json
from agate.table.where import where

# Attach the functions imported from the agate.table submodules to Table, so
# that each one is reachable as an attribute of the class (for example
# ``Table.where``). Every attribute uses the same name as the imported function.
# NOTE(review): the ``from_*`` entries are presumably alternate constructors
# (classmethods) rather than instance methods - confirm in their modules.
Table.aggregate = aggregate
Table.bar_chart = bar_chart
Table.bins = bins
Table.column_chart = column_chart
Table.compute = compute
Table.denormalize = denormalize
Table.distinct = distinct
Table.exclude = exclude
Table.find = find
Table.from_csv = from_csv
Table.from_fixed = from_fixed
Table.from_json = from_json
Table.from_object = from_object
Table.group_by = group_by
Table.homogenize = homogenize
Table.join = join
Table.limit = limit
Table.line_chart = line_chart
Table.merge = merge
Table.normalize = normalize
Table.order_by = order_by
Table.pivot = pivot
Table.print_bars = print_bars
Table.print_html = print_html
Table.print_structure = print_structure
Table.print_table = print_table
Table.rename = rename
Table.scatterplot = scatterplot
Table.select = select
Table.to_csv = to_csv
Table.to_json = to_json
Table.where = where