1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
|
# kate: syntax Python;
# cython: profile=False, emit_code_comments=False
from __future__ import print_function, division, absolute_import
from xopen import xopen
from .seqio import _shorten, FormatError, SequenceReader
cdef class Sequence(object):
"""
A record in a FASTQ file. Also used for FASTA (then the qualities attribute
is None). qualities is a string and it contains the qualities encoded as
ascii(qual+33).
If an adapter has been matched to the sequence, the 'match' attribute is
set to the corresponding Match instance.
"""
cdef:
public str name
public str sequence
public str qualities
public bint second_header
public object match
public object match_info
def __init__(self, str name, str sequence, str qualities=None, bint second_header=False, match=None,
match_info=None):
"""Set qualities to None if there are no quality values"""
self.name = name
self.sequence = sequence
self.qualities = qualities
self.second_header = second_header
self.match = match
self.match_info = match_info
if qualities is not None and len(qualities) != len(sequence):
rname = _shorten(name)
raise FormatError("In read named {0!r}: length of quality sequence ({1}) and length "
"of read ({2}) do not match".format(
rname, len(qualities), len(sequence)))
def __getitem__(self, key):
"""slicing"""
return self.__class__(
self.name,
self.sequence[key],
self.qualities[key] if self.qualities is not None else None,
self.second_header,
self.match,
self.match_info)
def __repr__(self):
qstr = ''
if self.qualities is not None:
qstr = ', qualities={0!r}'.format(_shorten(self.qualities))
return '<Sequence(name={0!r}, sequence={1!r}{2})>'.format(_shorten(self.name), _shorten(self.sequence), qstr)
def __len__(self):
return len(self.sequence)
def __richcmp__(self, other, int op):
if 2 <= op <= 3:
eq = self.name == other.name and \
self.sequence == other.sequence and \
self.qualities == other.qualities
if op == 2:
return eq
else:
return not eq
else:
raise NotImplementedError()
def __reduce__(self):
return (Sequence, (self.name, self.sequence, self.qualities, self.second_header))
class FastqReader(SequenceReader):
"""
Reader for FASTQ files. Does not support multi-line FASTQ files.
"""
def __init__(self, file, sequence_class=Sequence):
"""
file is a filename or a file-like object.
If file is a filename, then .gz files are supported.
"""
super(FastqReader, self).__init__(file)
self.sequence_class = sequence_class
self.delivers_qualities = True
def __iter__(self):
"""
Yield Sequence objects
"""
cdef int i = 0
cdef int strip
cdef str line, name, qualities, sequence, name2
sequence_class = self.sequence_class
it = iter(self._file)
line = next(it)
if not (line and line[0] == '@'):
raise FormatError("Line {0} in FASTQ file is expected to start with '@', but found {1!r}".format(i+1, line[:10]))
strip = -2 if line.endswith('\r\n') else -1
name = line[1:strip]
i = 1
for line in it:
if i == 0:
if not (line and line[0] == '@'):
raise FormatError("Line {0} in FASTQ file is expected to start with '@', but found {1!r}".format(i+1, line[:10]))
name = line[1:strip]
elif i == 1:
sequence = line[:strip]
elif i == 2:
if line == '+\n': # check most common case first
name2 = ''
else:
line = line[:strip]
if not (line and line[0] == '+'):
raise FormatError("Line {0} in FASTQ file is expected to start with '+', but found {1!r}".format(i+1, line[:10]))
if len(line) > 1:
if not line[1:] == name:
raise FormatError(
"At line {0}: Sequence descriptions in the FASTQ file don't match "
"({1!r} != {2!r}).\n"
"The second sequence description must be either empty "
"or equal to the first description.".format(i+1,
name, line[1:]))
second_header = True
else:
second_header = False
elif i == 3:
if len(line) == len(sequence) - strip:
qualities = line[:strip]
else:
qualities = line.rstrip('\r\n')
yield sequence_class(name, sequence, qualities, second_header=second_header)
i = (i + 1) % 4
if i != 0:
raise FormatError("FASTQ file ended prematurely")
|