File: _format.py

package info (click to toggle)
q2-sample-classifier 2024.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,732 kB
  • sloc: python: 5,060; makefile: 41; sh: 13
file content (183 lines) | stat: -rw-r--r-- 6,871 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# ----------------------------------------------------------------------------
# Copyright (c) 2017-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import tarfile
import json

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError


def _validate_record_len(cells, current_line_number, exp_len):
    if len(cells) != exp_len:
        raise ValidationError(
            "Expected data record to be TSV with {0} "
            "fields. Detected {1} fields at line {2}:\n\n{3!r}"
            .format(exp_len, len(cells), current_line_number, cells))


def _validate_file_not_empty(has_data):
    if not has_data:
        raise ValidationError(
            "There must be at least one data record present in the "
            "file in addition to the header line.")


class BooleanSeriesFormat(model.TextFileFormat):
    """Tab-separated text: a header line, then two-field records.

    The second field of every record must be the literal string ``True``
    or ``False``. The header line is read but not inspected.
    """

    def _validate_(self, level):
        """Validate the file body.

        Parameters
        ----------
        level : str
            ``'min'`` checks at most the first 5 records; ``'max'`` checks
            every record. Any other value raises ``KeyError``.

        Raises
        ------
        ValidationError
            If a record does not have exactly two fields, if its second
            field is not ``True``/``False``, or if no record is present.
        """
        max_records = {'min': 5, 'max': None}[level]
        with self.open() as handle:
            # The header is consumed and skipped; its content is not
            # validated for now.
            handle.readline()

            seen_record = False
            # Records start on line 2 because line 1 is the header.
            for row_number, raw_row in enumerate(handle, start=2):
                fields = raw_row.strip().split('\t')
                _validate_record_len(fields, row_number, 2)

                value = str(fields[1])
                if value not in ('True', 'False'):
                    raise ValidationError(
                        "Expected data to be comprised of values `True` and "
                        "`False`, found {0} at line {1}."
                        .format(value, row_number))

                seen_record = True
                # row_number - 1 is the count of records checked so far.
                records_checked = row_number - 1
                if max_records is not None and records_checked >= max_records:
                    break

            _validate_file_not_empty(seen_record)


# Single-file directory format: pairs the file name 'outliers.tsv' with
# BooleanSeriesFormat.
BooleanSeriesDirectoryFormat = model.SingleFileDirectoryFormat(
    'BooleanSeriesDirectoryFormat', 'outliers.tsv',
    BooleanSeriesFormat)


# This is effectively an internal format - it isn't registered with the
# plugin, but rather used as part of a dir fmt. This format also exists
# in q2-feature-classifier.
class PickleFormat(model.BinaryFileFormat):
    """Binary file format whose only validation is a tar-archive check."""

    def _validate_(self, level):
        """Raise ``ValidationError`` unless the file is a tar archive.

        ``level`` is accepted but not consulted; the same check runs at
        every level.
        """
        archive_path = str(self)
        if tarfile.is_tarfile(archive_path):
            return

        raise ValidationError(
            "Unable to load pickled file (not a tar file).")


# https://github.com/qiime2/q2-types/issues/49
# This is effectively an internal format - it isn't registered with the
# plugin, but rather used as part of a dir fmt. This format also exists
# in q2-feature-classifier.
class JSONFormat(model.TextFileFormat):
    """Text file format whose content must parse as JSON."""

    def _validate_(self, level):
        """Raise ``ValidationError`` if the file is not valid JSON.

        ``level`` is accepted but not consulted; the whole file is parsed
        at every level.
        """
        with self.open() as handle:
            try:
                # The parsed document is discarded; only parseability
                # matters here.
                json.loads(handle.read())
            except json.JSONDecodeError as decode_error:
                raise ValidationError(decode_error)


class SampleEstimatorDirFmt(model.DirectoryFormat):
    """Directory format declaring two member files (see fields below)."""
    # 'sklearn_version.json', declared with format=JSONFormat.
    version_info = model.File('sklearn_version.json', format=JSONFormat)
    # 'sklearn_pipeline.tar', declared with format=PickleFormat.
    sklearn_pipeline = model.File('sklearn_pipeline.tar', format=PickleFormat)


class PredictionsFormat(model.TextFileFormat):
    """Tab-separated text: a header line, then two-field records.

    Only the number of fields per record is checked; field values and the
    header are not inspected.
    """

    def _validate(self, n_records=None):
        """Validate up to ``n_records`` records (all records when ``None``).

        Raises
        ------
        ValidationError
            If a record does not have exactly two fields, or if the file
            has no record after the header line.
        """
        with self.open() as handle:
            # The header is consumed but deliberately not validated, since
            # the name of the predicted column should be flexible. Methods
            # in q2-sample-classifier write "predicted-*", but user-defined
            # column names must be accepted as well.
            handle.readline()

            found_record = False
            # Records start on line 2 because line 1 is the header.
            for row_number, raw_row in enumerate(handle, start=2):
                # Strip each cell rather than the whole line; stripping
                # the line would drop empty leading/trailing cells.
                fields = list(map(str.strip, raw_row.split('\t')))
                _validate_record_len(fields, row_number, 2)
                found_record = True

                # row_number - 1 is the count of records checked so far.
                limit_reached = (
                    n_records is not None and row_number - 1 >= n_records)
                if limit_reached:
                    break

            _validate_file_not_empty(found_record)

    def _validate_(self, level):
        """Run ``_validate`` with the record limit for ``level``.

        ``'min'`` checks at most 5 records; ``'max'`` checks all of them.
        """
        limit_by_level = {'min': 5, 'max': None}
        self._validate(limit_by_level[level])


# Single-file directory format: pairs the file name 'predictions.tsv' with
# PredictionsFormat.
PredictionsDirectoryFormat = model.SingleFileDirectoryFormat(
    'PredictionsDirectoryFormat', 'predictions.tsv',
    PredictionsFormat)


class _MultiColumnNumericFormat(model.TextFileFormat):
    """Tab-separated text: a header line, then records of numeric columns.

    Every record must have a row name followed by one or more fields, each
    of which must be parseable by ``float``. The header is not inspected.
    """

    def _validate(self, n_records=None):
        """Validate up to ``n_records`` records (all records when ``None``).

        Raises
        ------
        ValidationError
            If a record has fewer than two fields, if any field after the
            row name is not numeric, or if the file has no record after
            the header line.
        """
        with self.open() as fh:
            # validate header
            # for now we will not validate any information in the header,
            # since column names, count etc are frequently unique to individual
            # estimators. Let's keep this flexible.
            fh.readline()

            # validate body
            has_data = False
            for line_number, line in enumerate(fh, start=2):
                # we want to strip each cell, not the original line
                # otherwise empty cells are dropped, causing a TypeError
                cells = [c.strip() for c in line.split('\t')]
                if len(cells) < 2:
                    raise ValidationError(
                        "Expected data record to be TSV with two or more "
                        "fields. Detected {0} fields at line {1}:\n\n{2!r}"
                        .format(len(cells), line_number, cells))
                # all values (except row name) should be numbers; check
                # them one at a time so the error names the cell that
                # actually failed rather than always the first column
                for cell in cells[1:]:
                    try:
                        float(cell)
                    except ValueError as err:
                        raise ValidationError(
                            "Columns must contain only numeric values. "
                            "A non-numeric value ({0!r}) was detected at line "
                            "{1}.".format(cell, line_number)) from err

                has_data = True
                # line_number - 1 is the count of records checked so far
                if n_records is not None and (line_number - 1) >= n_records:
                    break

            _validate_file_not_empty(has_data)

    def _validate_(self, level):
        """Run ``_validate`` with the record limit for ``level``.

        ``'min'`` checks at most 5 records; ``'max'`` checks all of them.
        """
        record_count_map = {'min': 5, 'max': None}
        self._validate(record_count_map[level])


class ImportanceFormat(_MultiColumnNumericFormat):
    """Named subclass that inherits all validation unchanged."""


# Single-file directory format: pairs the file name 'importance.tsv' with
# ImportanceFormat.
ImportanceDirectoryFormat = model.SingleFileDirectoryFormat(
    'ImportanceDirectoryFormat', 'importance.tsv',
    ImportanceFormat)


class ProbabilitiesFormat(_MultiColumnNumericFormat):
    """Named subclass that inherits all validation unchanged."""


# Single-file directory format: pairs the file name
# 'class_probabilities.tsv' with ProbabilitiesFormat.
ProbabilitiesDirectoryFormat = model.SingleFileDirectoryFormat(
    'ProbabilitiesDirectoryFormat', 'class_probabilities.tsv',
    ProbabilitiesFormat)


# Single-file directory format: pairs the file name 'true_targets.tsv' with
# PredictionsFormat (the same file format used for 'predictions.tsv').
TrueTargetsDirectoryFormat = model.SingleFileDirectoryFormat(
    'TrueTargetsDirectoryFormat', 'true_targets.tsv',
    PredictionsFormat)