File: seq.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (145 lines) | stat: -rw-r--r-- 4,800 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Classes to support "biological sequence" files.

:Author: Bob Harris (rsharris@bx.psu.edu)
"""

# DNA reverse complement table

DNA_COMP = (
    "                                             -                  "
    " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      "
    "                                                                "
    "                                                                "
)


class SeqFile:
    """
    A biological sequence is a sequence of bytes or characters.  Usually these
    represent DNA (A,C,G,T), proteins, or some variation of those.

    class attributes:

        file:    file object containing the sequence
        revcomp: whether gets from this sequence should be reverse-complemented
                 False => no reverse complement
                 True  => (same as "-5'")
                 "maf" => (same as "-5'")
                 "+5'" => minus strand is from plus strand's 5' end (same as "-3'")
                 "+3'" => minus strand is from plus strand's 3' end (same as "-5'")
                 "-5'" => minus strand is from its 5' end (as per MAF file format)
                 "-3'" => minus strand is from its 3' end (as per genome browser,
                          but with origin-zero)
        name:    usually a species and/or chromosome name (e.g. "mule.chr5");  if
                 the file contains a name, that overrides this one
        gap:     gap character that aligners should use for gaps in this sequence
    """

    def __init__(self, file=None, revcomp=False, name="", gap=None):
        self.file = file
        if revcomp:
            self.revcomp = "-5'"
        elif revcomp == "+3'":
            self.revcomp = "-5'"
        elif revcomp == "+5'":
            self.revcomp = "-3'"
        elif revcomp == "maf":
            self.revcomp = "-5'"
        else:
            self.revcomp = revcomp
        self.name = name
        if gap is None:
            self.gap = "-"
        else:
            self.gap = gap

        self.text = None  # (subclasses fill in text and
        self.length = 0  # length or they most override get())

    def close(self):
        assert self.file is not None
        self.file.close()
        self.file = None

    def extract_name(self, line):
        try:
            return line.split()[0]
        except Exception:
            return ""

    def set_text(self, text):
        self.text = text
        self.length = len(text)

    def __str__(self):
        text = ""
        if self.name is not None:
            text += self.name + " "
        text += self.get(0, self.length)
        return text

    def get(self, start, length):
        """
        Fetch subsequence starting at position `start` with length `length`.
        This method is picky about parameters, the requested interval must
        have non-negative length and fit entirely inside the NIB sequence,
        the returned string will contain exactly 'length' characters, or an
        AssertionError will be generated.
        """
        # Check parameters
        assert length >= 0, "Length must be non-negative (got %d)" % length
        assert start >= 0, "Start must be greater than 0 (got %d)" % start
        assert (
            start + length <= self.length
        ), f"Interval beyond end of sequence ({start}..{start + length} > {self.length})"
        # Fetch sequence and reverse complement if necesary
        if not self.revcomp:
            return self.raw_fetch(start, length)
        if self.revcomp == "-3'":
            return self.reverse_complement(self.raw_fetch(start, length))
        assert self.revcomp == "-5'", "unrecognized reverse complement scheme"
        start = self.length - (start + length)
        return self.reverse_complement(self.raw_fetch(start, length))

    def raw_fetch(self, start, length):
        return self.text[start : start + length]

    def reverse_complement(self, text):
        return text.translate(DNA_COMP)[::-1]


class SeqReader:
    """Iterate over all sequences in a file in order"""

    def __init__(self, file, revcomp=False, name="", gap=None):
        self.file = file
        self.revcomp = revcomp
        self.name = name
        self.gap = gap
        self.seqs_read = 0

    def close(self):
        self.file.close()

    def __iter__(self):
        return SeqReaderIter(self)

    def __next__(self):
        # subclasses should override this method and return the next sequence
        # (of type SeqFile or a subclass) read from self.file
        return


class SeqReaderIter:
    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        return self

    def __next__(self):
        v = next(self.reader)
        if not v:
            raise StopIteration
        return v