File: Index.py

package info (click to toggle)
python-biopython 1.64%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 44,416 kB
  • ctags: 12,472
  • sloc: python: 153,759; xml: 67,286; ansic: 9,003; sql: 1,488; makefile: 144; sh: 59
file content (147 lines) | stat: -rw-r--r-- 4,945 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Copyright 1999 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Index.py

This module provides a way to create indexes to text files.

Classes:
Index     Dictionary-like class used to store index information.

_ShelveIndex    An Index class based on the shelve module.
_InMemoryIndex  An in-memory Index class.

"""
import os
import array
import shelve

try:
    import cPickle as pickle # Only available under Python 2
except ImportError:
    import pickle # Python 3

class _ShelveIndex(dict):
    """An index file wrapped around shelve.

    The shelf is opened by __init__, stored as ``self.data`` and closed
    when the object is deleted.

    NOTE(review): this class subclasses dict but overrides none of the
    mapping methods, so ``index[key]`` works on the inherited in-memory
    dict and never touches ``self.data``.
    """
    # Without a good dbm module installed, this is pretty slow and
    # generates large files.  When generating an index on a FASTA-
    # formatted file with 82000 sequences (37Mb), the
    # index 'dat' file is 42Mb and 'dir' file is 8Mb.

    __version = 2
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Open the shelf ``indexname``, creating it if necessary.

        Arguments:
        indexname   File name passed to shelve.open.
        truncate    If true, throw away any existing database and start
                    with a new, empty one.

        Without truncate, an existing shelf is opened read-only and must
        carry this class's version number; IOError is raised when the
        version entry is missing or different.  If the shelf cannot be
        opened, a new one is created instead.
        """
        dict.__init__(self)

        if truncate:
            # In python 1.52 and before, dumbdbm (under shelve)
            # doesn't clear the old database.
            for suffix in ('.dir', '.dat', '.bak'):
                path = indexname + suffix
                try:
                    if os.path.exists(path):
                        os.unlink(path)
                except OSError:
                    # Best effort only: the shelf is opened with flag='n'
                    # below, which asks for an empty database anyway.
                    pass
            self._new_shelf(indexname)
            return

        try:
            self.data = shelve.open(indexname, flag='r')
        except Exception:
            # No usable database exists.  The concrete error class depends
            # on the dbm backend, so Exception is caught here; unlike a
            # bare except, KeyboardInterrupt and SystemExit still propagate.
            self._new_shelf(indexname)
            return

        # Check to make sure the database is the correct version.
        version = self.data.get(self.__version_key, None)
        if version is None:
            raise IOError("Unrecognized index format")
        elif version != self.__version:
            raise IOError("Version %s doesn't match my version %s"
                          % (version, self.__version))

    def _new_shelf(self, indexname):
        """Create an empty shelf and stamp it with the current version."""
        self.data = shelve.open(indexname, flag='n')
        self.data[self.__version_key] = self.__version

    def __del__(self):
        """Close the shelf, if __init__ got far enough to open one."""
        if 'data' in self.__dict__:
            self.data.close()


class _InMemoryIndex(dict):
    """This creates an in-memory index file.

    The object is a dict whose contents are read from ``indexname`` when it
    is created and written back to that file when it is deleted, provided
    it was modified through __setitem__, __delitem__, update or clear.
    """
    # File Format:
    # version
    # key value
    # [...]
    #
    # Every field is a pickled object encoded with _tostr, so it contains
    # no whitespace.

    __version = 3
    __version_key = '__version'

    def __init__(self, indexname, truncate=None):
        """Load the index stored in ``indexname``, if that file exists.

        Arguments:
        indexname   Path of the index file.
        truncate    If true, delete an existing index file instead of
                    loading it.

        Raises IOError if the file was written with a different version.
        """
        self._indexname = indexname
        dict.__init__(self)
        self.__changed = 0     # the index hasn't changed

        # Remove the database if truncate is true.
        if truncate and os.path.exists(indexname):
            os.unlink(indexname)
            self.__changed = 1

        # Load the database if it exists
        if os.path.exists(indexname):
            with open(indexname) as handle:
                version = self._toobj(handle.readline().rstrip())
                if version != self.__version:
                    raise IOError("Version %s doesn't match my version %s"
                                  % (version, self.__version))
                for line in handle:
                    key, value = line.split()
                    # Store through dict directly: loading must not mark
                    # the index as changed, otherwise a failure half-way
                    # through would make __del__ overwrite the file with
                    # a partial index.
                    dict.__setitem__(self, self._toobj(key),
                                     self._toobj(value))
                self.__changed = 0

    def update(self, dict):
        """Merge ``dict`` into the index and mark the index as changed."""
        self.__changed = 1
        # The parameter shadows the builtin dict type, so the base class
        # implementation has to be reached through super().
        super(_InMemoryIndex, self).update(dict)

    def __setitem__(self, key, value):
        """Store ``value`` under ``key`` and mark the index as changed."""
        self.__changed = 1
        dict.__setitem__(self, key, value)

    def __delitem__(self, key):
        """Remove ``key`` and mark the index as changed."""
        self.__changed = 1
        dict.__delitem__(self, key)

    def clear(self):
        """Remove every entry and mark the index as changed."""
        self.__changed = 1
        dict.clear(self)

    def __del__(self):
        """Write the index back to its file if it has been changed."""
        if self.__changed:
            with open(self._indexname, 'w') as handle:
                handle.write("%s\n" % self._tostr(self.__version))
                for key, value in self.items():
                    handle.write("%s %s\n" %
                                 (self._tostr(key), self._tostr(value)))
            # What is on disk now matches memory; don't write it again.
            self.__changed = 0

    def _tostr(self, obj):
        """Return ``obj`` pickled and encoded as comma-separated integers."""
        # I need a representation of the object that's saveable to
        # a file that uses whitespace as delimiters.  Thus, I'm
        # going to pickle the object, and then convert each byte of
        # the pickle to its signed integer value.  Then, I'm going to convert
        # the integers into strings and join them together with commas.
        # It's not the most efficient way of storing things, but it's
        # relatively fast.
        s = pickle.dumps(obj)
        intlist = array.array('b', s)
        return ','.join(str(i) for i in intlist)

    def _toobj(self, str):
        """Decode a string produced by _tostr back into the original object.

        NOTE: the data is passed to pickle.loads, which can execute
        arbitrary code; only open index files from a trusted source.
        """
        packed = array.array('b', [int(i) for i in str.split(',')])
        # Turn the signed byte values back into the raw pickle.  Going
        # through chr() fails for values below zero and yields text
        # instead of bytes on Python 3.
        try:
            raw = packed.tobytes()
        except AttributeError:
            # Python 2 arrays only provide tostring().
            raw = packed.tostring()
        return pickle.loads(raw)

# Public name for the index implementation used by default.
Index = _InMemoryIndex