File: pygr_api.py

package info (click to toggle)
python-screed 1.1.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 820 kB
  • sloc: python: 3,356; makefile: 169; sh: 32; javascript: 16
file content (177 lines) | stat: -rw-r--r-- 4,906 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# Copyright (c) 2008, Michigan State University.

"""
A simple wrapper implementing a pygr-compatible SequenceDB based on screed.

There are two implementions:
 - ScreedSequenceDB
 - ScreedSequenceDB_ByIndex

ScreedSequenceDB uses the sequence name as the sequence ID, which
mimics the behavior of pygr's SequenceFileDB and is good for
small-to-medium sized collections of sequences.
ScreedSequenceDB_ByIndex uses the sequence's index (0...size of
database) as a sequence ID, rather than the sequence name; this is
much faster for databases with many, many sequences.

Unlike the normal seqdb, screed will load the entire sequence record
into memory on request, so it's not good for large sequences.

All screed records are guaranteed to have an 'index', a 'name', and a
'sequence' attribute; anything else is specific to the database writer
you use.  The raw screed record (which contains any other information)
is available under seqObj.record.

Note: the underlying screed database must already have been built with
fadbm or fqdbm.

CTB 3/20/09
"""

import UserDict

from screed import ScreedDB

from pygr.sequence import SequenceBase
from pygr.seqdb import SequenceDB
from pygr.sequtil import DNA_SEQTYPE

#


class ScreedSequence(SequenceBase):

    """Sequence implementation based on screed; stores screed record info.

    Attributes:
      - 'id' and 'db' are the standard pygr-ish name/database attrs.
      - 'record' is the screed 'record' object, containing name, etc.
      - 'name' is the record name, which can be the same as 'id' but
        can also be different (see ScreedSequenceDB_ByIndex).
      - 'seq' is the sequence.

    """

    def __init__(self, db, id):
        self.id = id
        SequenceBase.__init__(self)
        info = db.seqInfoDict[id]

        self.record = info.record
        self.name = info.record.name
        self.seq = info.record.sequence


class ScreedSequenceDB(SequenceDB):

    """SequenceDB implementation based on screed; retrieve seqs by name."""
    itemClass = ScreedSequence

    def __init__(self, filepath):
        self.filepath = filepath
        self.seqInfoDict = _ScreedSeqInfoDict_ByName(filepath)
        SequenceDB.__init__(self)

    def _set_seqtype(self):
        self._seqtype = DNA_SEQTYPE

    def __repr__(self):
        return "<%s '%s'>" % (self.__class__.__name__, self.filepath)

    # override inherited __reduce__/__getstate__/__setstate__ from SequenceDB.
    def __reduce__(self):
        return (ScreedSequenceDB, (self.filepath,))


class ScreedSequenceDB_ByIndex(SequenceDB):

    """SequenceDB implementation based on screed; retrieve seqs by index."""
    itemClass = ScreedSequence

    def __init__(self, filepath):
        self.filepath = filepath
        self.seqInfoDict = _ScreedSeqInfoDict_ByIndex(filepath)
        SequenceDB.__init__(self)

    def _set_seqtype(self):
        self._seqtype = DNA_SEQTYPE

    def __repr__(self):
        return "<%s '%s'>" % (self.__class__.__name__, self.filepath)

    # override inherited __reduce__/__getstate__/__setstate__ from SequenceDB.
    def __reduce__(self):
        return (ScreedSequenceDB_ByIndex, (self.filepath,))


class _ScreedSequenceInfo(object):

    """Objects to put in seqInfoDict values, for holding screed record info."""

    def __init__(self, id, record):
        self.id = id
        self.record = record
        self.length = len(record.sequence)


class _ScreedSeqInfoDict_ByName(object, UserDict.DictMixin):

    """seqInfoDict implementation that uses names to retrieve records."""

    def __init__(self, filepath):
        self.sdb = ScreedDB(filepath)

    def __getitem__(self, k):
        v = self.sdb[k]
        return _ScreedSequenceInfo(k, v)

    def keys(self):
        return self.sdb.keys()

    def itervalues(self):
        i = 0
        max_index = len(self.sdb)
        while i < max_index:
            v = self.sdb.loadRecordByIndex(i)
            yield _ScreedSequenceInfo(v.name, v)
            i += 1

    def iteritems(self):
        for v in self.itervalues():
            yield v.record.name, v


class _ScreedSeqInfoDict_ByIndex(object, UserDict.DictMixin):

    """seqInfoDict implementation that uses indices to retrieve records."""

    def __init__(self, filepath):
        self.sdb = ScreedDB(filepath)

    def __getitem__(self, k):
        n = int(k)
        v = self.sdb.loadRecordByIndex(n)
        return _ScreedSequenceInfo(k, v)

    def keys(self):
        return range(0, len(self.sdb))

    def iterkeys(self):
        i = 0
        max_index = len(self.sdb)
        while i < max_index:
            yield i
            i += 1


if __name__ == '__main__':
    import sys
    filename = sys.argv[1]

    db = ScreedSequenceDB(filename)
    for k in db:
        print(k, repr(db[k]), db[k].name)

    db = ScreedSequenceDB_ByIndex(filename)
    for k in db:
        print(k, repr(db[k]), db[k].name)