"""
Reference library for AIRR schema for Ig/TCR rearrangements
"""
from __future__ import print_function
import sys
import csv
from airr.schema import RearrangementSchema, ValidationError
class RearrangementReader:
    """
    Iterator for reading Rearrangement objects in TSV format

    Rows are parsed with a csv.DictReader using the 'excel-tab' dialect.
    Type conversion of each value is delegated to the schema object
    (RearrangementSchema), keyed by the type the schema reports for the field.

    Attributes:
      fields (list): field names in the input Rearrangement file.
      external_fields (list): list of fields in the input file that are not
                              part of the Rearrangement definition.
    """
    @property
    def fields(self):
        """
        Get list of fields

        Returns:
          list : field names, exactly as reported by the underlying
                 csv.DictReader (which reports None when the input has no
                 header line).
        """
        return self.dict_reader.fieldnames

    @property
    def external_fields(self):
        """
        Get list of field that are not in the Rearrangement schema

        Returns:
          list : field names. Empty when the input has no header line.
        """
        # csv.DictReader leaves fieldnames as None for an empty file; treat
        # that as "no fields" instead of failing on iteration over None.
        fieldnames = self.dict_reader.fieldnames or []
        return [f for f in fieldnames if f not in self.schema.properties]

    def __init__(self, handle, base=1, validate=False, debug=False):
        """
        Initialization

        Arguments:
          handle (file): file handle of the open Rearrangement file.
          base (int): one of 0 or 1 specifying the coordinate schema in the input file.
                      If 1, then the file is assumed to contain 1-based closed intervals
                      that will be converted to python style 0-based half-open intervals
                      for known fields. If 0, then values will be unchanged.
          validate (bool): perform validation. If True then basic validation will be
                           performed while reading the data. A ValidationError exception
                           will be raised if an error is found.
          debug (bool): debug state. If True prints debug information.

        Returns:
          airr.io.RearrangementReader: reader object.
        """
        # arguments
        self.handle = handle
        self.base = base
        self.debug = debug
        self.validate = validate
        self.schema = RearrangementSchema

        # data reader, collect field names
        self.dict_reader = csv.DictReader(self.handle, dialect='excel-tab')

    def __iter__(self):
        """
        Iterator initializer

        When validation is enabled the header is handed to the schema's
        validate_header() before iteration starts.

        Returns:
          airr.io.RearrangementReader
        """
        # Validate fields
        if self.validate:
            self.schema.validate_header(self.dict_reader.fieldnames)

        return self

    def __next__(self):
        """
        Next method

        Returns:
          dict: parsed Rearrangement data.

        Raises:
          StopIteration: when the underlying reader is exhausted.
          ValidationError: if validation is enabled and the row has more
                           values than the header, or a schema conversion
                           reports an invalid value.
          ValueError: if validation is disabled and the row has more values
                      than the header.
        """
        # StopIteration from the underlying reader propagates unchanged.
        row = next(self.dict_reader)

        for f in row:
            # row entry with no header: csv.DictReader files surplus values
            # under the key None (its default restkey).
            if f is None:
                if self.validate:
                    raise ValidationError('row has extra data')
                else:
                    raise ValueError('row has extra data')

            # Convert types
            spec = self.schema.type(f)
            try:
                if spec == 'boolean':
                    row[f] = self.schema.to_bool(row[f], validate=self.validate)
                elif spec == 'integer':
                    row[f] = self.schema.to_int(row[f], validate=self.validate)
                elif spec == 'number':
                    row[f] = self.schema.to_float(row[f], validate=self.validate)
            except ValidationError as e:
                # Prefix the schema's message with the offending field name.
                raise ValidationError('field %s has %s' %(f, e))

            # Adjust coordinates: 1-based closed start -> 0-based half-open start.
            # NOTE(review): any '*_start' value that does not support
            # subtraction (e.g. None, or a field the schema left as a string)
            # is replaced by None here.
            if self.base == 1 and f.endswith('_start'):
                try:
                    row[f] = row[f] - 1
                except TypeError:
                    row[f] = None

        return row

    def close(self):
        """
        Closes the Rearrangement file
        """
        self.handle.close()

    def next(self):
        """
        Next method

        Python 2 iterator protocol alias for __next__().
        """
        return self.__next__()
class RearrangementWriter:
    """
    Writer class for Rearrangement objects in TSV format

    Rows are emitted through a csv.DictWriter using the 'excel-tab' dialect
    with '\\n' line endings; keys of a record that are not among the writer's
    field names are ignored.

    Attributes:
      fields (list): field names in the output Rearrangement file.
      external_fields (list): list of fields in the output file that are not
                              part of the Rearrangement definition.
    """
    @property
    def fields(self):
        """
        Get list of fields

        Returns:
          list : field names.
        """
        return self.dict_writer.fieldnames

    @property
    def external_fields(self):
        """
        Get list of field that are not in the Rearrangements schema

        Returns:
          list : field names.
        """
        return [f for f in self.dict_writer.fieldnames
                if f not in self.schema.properties]

    def __init__(self, handle, fields=None, base=1, debug=False):
        """
        Initialization

        The header line is written to the handle immediately.

        Arguments:
          handle (file): file handle of the open Rearrangements file.
          fields (list) : list of non-required fields to add. May include fields undefined by the schema.
          base (int): one of 0 or 1 specifying the coordinate schema in the output file.
                      Data provided to the write is assumed to be in python style 0-based
                      half-open intervals. If 1, then data will be converted to 1-based
                      closed intervals for known fields before writing. If 0, then values will be unchanged.
          debug (bool): debug state. If True prints debug information.

        Returns:
          airr.io.RearrangementWriter: writer object.
        """
        # arguments
        self.handle = handle
        self.base = base
        self.debug = debug
        self.schema = RearrangementSchema

        # order fields according to spec: required fields first, then the
        # requested schema-optional fields, then any fields unknown to the schema
        field_names = list(self.schema.required)
        if fields is not None:
            additional_fields = []
            for f in fields:
                if f in self.schema.required:
                    continue
                elif f in self.schema.optional:
                    field_names.append(f)
                else:
                    additional_fields.append(f)
            field_names.extend(additional_fields)

        # open writer and write header
        self.dict_writer = csv.DictWriter(self.handle, fieldnames=field_names, dialect='excel-tab',
                                          extrasaction='ignore', lineterminator='\n')
        self.dict_writer.writeheader()

    def close(self):
        """
        Closes the Rearrangement file
        """
        self.handle.close()

    def write(self, row):
        """
        Write a row to the Rearrangement file

        The record passed in is not modified; coordinate and boolean
        conversions are applied to a copy.

        Arguments:
          row (dict): row to write.
        """
        # validate row (warnings only, and only in debug mode)
        if self.debug:
            for field in self.schema.required:
                if row.get(field, None) is None:
                    sys.stderr.write('Warning: Record is missing AIRR required field (' + field + ').\n')

        # Convert a shallow copy so that the caller's record keeps its
        # 0-based coordinates and native booleans; otherwise writing the
        # same record twice would shift every '*_start' value twice.
        record = dict(row)
        for f in row:
            # Adjust coordinates: 0-based half-open start -> 1-based closed start.
            # A start the schema converts to None cannot be shifted and is
            # written as an empty value.
            if self.base == 1 and f.endswith('_start'):
                try:
                    record[f] = self.schema.to_int(record[f]) + 1
                except TypeError:
                    record[f] = None

            # Convert types
            spec = self.schema.type(f)
            if spec == 'boolean':
                record[f] = self.schema.from_bool(record[f])

        self.dict_writer.writerow(record)
# TODO: pandas validation is needed if we load with pandas directly
# def validate_df(df, airr_schema):
#     valid = True
#
#     # check required fields
#     missing_fields = set(airr_schema.required) - set(df.columns)
#     if len(missing_fields) > 0:
#         print('Warning: file is missing mandatory fields: {}'.format(', '.join(missing_fields)))
#         valid = False
#
#     if not valid:
#         raise ValueError('invalid AIRR data file')