1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
|
"""
Classes to support "biological sequence" files.
:Author: Bob Harris (rsharris@bx.psu.edu)
"""
# DNA reverse complement table
DNA_COMP = (
" - "
" TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr "
" "
" "
)
class SeqFile:
"""
A biological sequence is a sequence of bytes or characters. Usually these
represent DNA (A,C,G,T), proteins, or some variation of those.
class attributes:
file: file object containing the sequence
revcomp: whether gets from this sequence should be reverse-complemented
False => no reverse complement
True => (same as "-5'")
"maf" => (same as "-5'")
"+5'" => minus strand is from plus strand's 5' end (same as "-3'")
"+3'" => minus strand is from plus strand's 3' end (same as "-5'")
"-5'" => minus strand is from its 5' end (as per MAF file format)
"-3'" => minus strand is from its 3' end (as per genome browser,
but with origin-zero)
name: usually a species and/or chromosome name (e.g. "mule.chr5"); if
the file contains a name, that overrides this one
gap: gap character that aligners should use for gaps in this sequence
"""
def __init__(self, file=None, revcomp=False, name="", gap=None):
self.file = file
if revcomp:
self.revcomp = "-5'"
elif revcomp == "+3'":
self.revcomp = "-5'"
elif revcomp == "+5'":
self.revcomp = "-3'"
elif revcomp == "maf":
self.revcomp = "-5'"
else:
self.revcomp = revcomp
self.name = name
if gap is None:
self.gap = "-"
else:
self.gap = gap
self.text = None # (subclasses fill in text and
self.length = 0 # length or they most override get())
def close(self):
assert self.file is not None
self.file.close()
self.file = None
def extract_name(self, line):
try:
return line.split()[0]
except Exception:
return ""
def set_text(self, text):
self.text = text
self.length = len(text)
def __str__(self):
text = ""
if self.name is not None:
text += self.name + " "
text += self.get(0, self.length)
return text
def get(self, start, length):
"""
Fetch subsequence starting at position `start` with length `length`.
This method is picky about parameters, the requested interval must
have non-negative length and fit entirely inside the NIB sequence,
the returned string will contain exactly 'length' characters, or an
AssertionError will be generated.
"""
# Check parameters
assert length >= 0, "Length must be non-negative (got %d)" % length
assert start >= 0, "Start must be greater than 0 (got %d)" % start
assert (
start + length <= self.length
), f"Interval beyond end of sequence ({start}..{start + length} > {self.length})"
# Fetch sequence and reverse complement if necesary
if not self.revcomp:
return self.raw_fetch(start, length)
if self.revcomp == "-3'":
return self.reverse_complement(self.raw_fetch(start, length))
assert self.revcomp == "-5'", "unrecognized reverse complement scheme"
start = self.length - (start + length)
return self.reverse_complement(self.raw_fetch(start, length))
def raw_fetch(self, start, length):
return self.text[start : start + length]
def reverse_complement(self, text):
return text.translate(DNA_COMP)[::-1]
class SeqReader:
"""Iterate over all sequences in a file in order"""
def __init__(self, file, revcomp=False, name="", gap=None):
self.file = file
self.revcomp = revcomp
self.name = name
self.gap = gap
self.seqs_read = 0
def close(self):
self.file.close()
def __iter__(self):
return SeqReaderIter(self)
def __next__(self):
# subclasses should override this method and return the next sequence
# (of type SeqFile or a subclass) read from self.file
return
class SeqReaderIter:
def __init__(self, reader):
self.reader = reader
def __iter__(self):
return self
def __next__(self):
v = next(self.reader)
if not v:
raise StopIteration
return v
|