File: io.py

package info (click to toggle)
python-airr 1.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 364 kB
  • sloc: python: 1,734; sh: 19; makefile: 10
file content (252 lines) | stat: -rw-r--r-- 7,940 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""
Reference library for AIRR schema for Ig/TCR rearrangements
"""
from __future__ import print_function
import sys
import csv
from airr.schema import RearrangementSchema, ValidationError


class RearrangementReader:
    """
    Iterator that yields the records of a tab-delimited Rearrangement file

    Each record is returned as a dictionary keyed by the column names found
    in the file header, with schema-typed columns converted from text.

    Attributes:
      fields (list): column names present in the input file.
      external_fields (list): column names present in the input file that
                              are not properties of the Rearrangement schema.
    """
    def __init__(self, handle, base=1, validate=False, debug=False):
        """
        Initialization

        Arguments:
          handle (file): open handle of the Rearrangement file to read.
          base (int): coordinate convention of the input file. If 1, every
                      column whose name ends in '_start' has 1 subtracted after
                      type conversion, which turns 1-based closed intervals into
                      python style 0-based half-open intervals. Any other value
                      leaves the coordinates unchanged.
          validate (bool): if True, the header is checked when iteration starts
                           and the flag is passed on to the schema type
                           converters; problems are reported as ValidationError.
          debug (bool): debug state. Stored on the instance; this class does
                        not read it anywhere else.

        Returns:
          airr.io.RearrangementReader: reader object.
        """
        self.schema = RearrangementSchema
        self.handle = handle
        self.base = base
        self.validate = validate
        self.debug = debug

        # The DictReader pulls the column names from the first line of the file
        self.dict_reader = csv.DictReader(self.handle, dialect='excel-tab')

    @property
    def fields(self):
        """
        Column names of the input file

        Returns:
          list : field names.
        """
        return self.dict_reader.fieldnames

    @property
    def external_fields(self):
        """
        Column names of the input file that the Rearrangement schema does not define

        Returns:
          list : field names.
        """
        extras = []
        for name in self.dict_reader.fieldnames:
            if name in self.schema.properties:
                continue
            extras.append(name)
        return extras

    def __iter__(self):
        """
        Iterator initializer

        When validation is enabled the header is checked here, before any
        record is returned.

        Returns:
          airr.io.RearrangementReader
        """
        if self.validate:
            self.schema.validate_header(self.dict_reader.fieldnames)
        return self

    def __next__(self):
        """
        Read, convert and return the next record

        Returns:
          dict: parsed Rearrangement data.

        Raises:
          StopIteration: when the file is exhausted.
          ValidationError: if validation is enabled and the row has more cells
                           than the header, or if a type converter reports an
                           invalid value.
          ValueError: if validation is disabled and the row has more cells than
                      the header.
        """
        # StopIteration from the underlying reader simply propagates
        row = next(self.dict_reader)

        for name in row:
            # csv.DictReader files surplus cells under the key None
            if name is None:
                error_class = ValidationError if self.validate else ValueError
                raise error_class('row has extra data')

            value = row[name]

            # Convert text to the type the schema declares for this column
            kind = self.schema.type(name)
            try:
                if kind == 'boolean':
                    value = self.schema.to_bool(value, validate=self.validate)
                elif kind == 'integer':
                    value = self.schema.to_int(value, validate=self.validate)
                elif kind == 'number':
                    value = self.schema.to_float(value, validate=self.validate)
            except ValidationError as err:
                raise ValidationError('field %s has %s' % (name, err))

            # Shift start coordinates; anything that cannot be decremented
            # (missing values, unconverted text) is stored as None
            if name and name.endswith('_start') and self.base == 1:
                try:
                    value = value - 1
                except TypeError:
                    value = None

            row[name] = value

        return row

    def next(self):
        """
        Python 2 spelling of the iterator protocol; delegates to __next__
        """
        return self.__next__()

    def close(self):
        """
        Closes the Rearrangement file
        """
        self.handle.close()


class RearrangementWriter:
    """
    Writer class for Rearrangement objects in TSV format

    The header line is written as soon as the writer is constructed.

    Attributes:
      fields (list): field names in the output Rearrangement file.
      external_fields (list): list of fields in the output file that are not
                              part of the Rearrangement definition.
    """
    @property
    def fields(self):
        """
        Get list of fields

        Returns:
          list : field names.
        """
        return self.dict_writer.fieldnames

    @property
    def external_fields(self):
        """
        Get list of fields that are not in the Rearrangement schema

        Returns:
          list : field names.
        """
        return [f for f in self.dict_writer.fieldnames \
                if f not in self.schema.properties]

    def __init__(self, handle, fields=None, base=1, debug=False):
        """
        Initialization

        Arguments:
          handle (file): file handle of the open Rearrangements file.
          fields (list) : list of non-required fields to add. May include fields undefined by the schema.
                          Required fields are always written and are skipped if listed here.
          base (int): one of 0 or 1 specifying the coordinate schema in the output file.
                      Data provided to the write is assumed to be in python style 0-based
                      half-open intervals. If 1, then data will be converted to 1-based
                      closed intervals for fields ending in '_start' before writing.
                      If 0, then values will be unchanged.
          debug (bool): debug state. If True, write() warns on stderr about
                        missing required fields.

        Returns:
          airr.io.RearrangementWriter: writer object.
        """
        # arguments
        self.handle = handle
        self.base = base
        self.debug = debug
        self.schema = RearrangementSchema

        # column order: required fields, then requested optional schema fields,
        # then requested fields the schema does not define
        field_names = list(self.schema.required)
        if fields is not None:
            additional_fields = []
            for f in fields:
                if f in self.schema.required:
                    continue
                elif f in self.schema.optional:
                    field_names.append(f)
                else:
                    additional_fields.append(f)
            field_names.extend(additional_fields)

        # open writer and write header; keys of a row that are not in
        # field_names are silently dropped (extrasaction='ignore')
        self.dict_writer = csv.DictWriter(self.handle, fieldnames=field_names, dialect='excel-tab',
                                          extrasaction='ignore', lineterminator='\n')
        self.dict_writer.writeheader()

    def close(self):
        """
        Closes the Rearrangement file
        """
        self.handle.close()

    def write(self, row):
        """
        Write a row to the Rearrangement file

        The conversions needed for output (coordinate shift, boolean encoding)
        are applied to a copy, so the caller's row is left untouched and can be
        reused or written again.

        Arguments:
            row (dict): row to write. Keys that are not output fields are ignored.
        """
        # validate row
        if self.debug:
            for field in self.schema.required:
                if row.get(field, None) is None:
                    sys.stderr.write('Warning: Record is missing AIRR required field (' + field + ').\n')

        # Convert a shallow copy: modifying `row` itself would leave the
        # caller holding 1-based coordinates and string booleans, and a second
        # write of the same record would shift the coordinates again.
        record = dict(row)
        for f in record:
            # Adjust coordinates
            if f.endswith('_start') and self.base == 1:
                try:
                    record[f] = self.schema.to_int(record[f]) + 1
                except TypeError:
                    # value could not be incremented (e.g. missing); write it empty
                    record[f] = None

            # Convert types
            spec = self.schema.type(f)
            if spec == 'boolean':
                record[f] = self.schema.from_bool(record[f])

        self.dict_writer.writerow(record)


# TODO: pandas validation is needed if we load with pandas directly
# def validate_df(df, airr_schema):
#     valid = True
#
#     # check required fields
#     missing_fields = set(airr_schema.required) - set(df.columns)
#     if len(missing_fields) > 0:
#         print('Warning: file is missing mandatory fields: {}'.format(', '.join(missing_fields)))
#         valid = False
#
#     if not valid:
#         raise ValueError('invalid AIRR data file')