1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
|
import os
from bsddb3 import db
import Location
import BaseDB
import Bio
# Keep a handle on the builtin open(); the module-level name "open" is
# rebound further down in this file.
_open = open # rename for internal use -- gets redefined below
# Index-type identifier: written into config.dat by BaseDB.write_config and
# passed to BaseDB.OpenDB.__init__ when an index is opened.
INDEX_TYPE = "BerkeleyDB/1"
def create(dbname, primary_namespace, secondary_namespaces,
           formatname = "unknown"):
    """Create a new, empty BerkeleyDB index in directory *dbname* and open it.

    The directory is created with ``os.mkdir`` (so it must not exist yet),
    a ``config.dat`` describing the index is written through
    ``BaseDB.write_config``, and one B-tree table is created for the primary
    namespace (``key_<namespace>``) plus one per secondary namespace
    (``id_<namespace>``).  The handles used for creation are closed again
    before the index is reopened.

    Parameters:
        dbname -- path of the directory to create for the index.
        primary_namespace -- name of the primary namespace.
        secondary_namespaces -- iterable of secondary namespace names.
        formatname -- format name stored in the config (default "unknown").

    Returns:
        Whatever the module-level ``open(dbname, "rw")`` returns; this
        module rebinds ``open`` to the ``BerkeleyDB`` class.

    Raises:
        OSError if the directory cannot be created; any error raised by
        ``BaseDB.write_config`` or by the Berkeley DB calls propagates.
    """
    os.mkdir(dbname)
    config_filename = os.path.join(dbname, "config.dat")
    BaseDB.write_config(config_filename = config_filename,
                        index_type = INDEX_TYPE,
                        primary_namespace = primary_namespace,
                        secondary_namespaces = secondary_namespaces,
                        fileid_info = {},
                        formatname = formatname
                        )

    dbenv = db.DBEnv(0)
    # Every DB handle created below, in creation order, so that all of them
    # are released even when one of the open() calls fails part-way through.
    handles = []
    try:
        envflags = db.DB_THREAD | db.DB_INIT_MPOOL
        dbenv.open(dbname, envflags | db.DB_CREATE)

        primary_table = db.DB(dbenv)
        handles.append(primary_table)
        # 0o660 is the same value as the legacy literal 0660, but it also
        # parses on Python 3.
        primary_table.open("key_%s" % (primary_namespace,), None,
                           db.DB_BTREE, db.DB_CREATE, 0o660)

        for namespace in secondary_namespaces:
            table = db.DB(dbenv)
            handles.append(table)
            table.open("id_%s" % (namespace,), None,
                       db.DB_BTREE, db.DB_CREATE, 0)
    finally:
        # Secondary tables first, then the primary table, then the
        # environment that owns them all.
        for handle in reversed(handles):
            handle.close()
        dbenv.close()

    return open(dbname, "rw")
class PrimaryNamespace(BaseDB.DictLookup):
    """Look up record locations by primary identifier.

    Wraps an open index object and reads from its ``primary_table``.
    """

    def __init__(self, db, namespace):
        """Bind to index *db*; *namespace* must equal its primary namespace."""
        self.db = db
        self.namespace = namespace
        assert namespace == db.primary_namespace

    def __getitem__(self, name):
        """Return a one-element list holding the Location stored for *name*."""
        index = self.db
        # A stored value consists of three tab-separated fields.
        filetag, startpos, length = index.primary_table[name].split("\t")
        # Element 0 of the per-file info entry is handed on as the filename.
        filename = index.fileid_info[filetag][0]
        location = Location.Location(self.namespace, name, filename,
                                     long(startpos), long(length))
        return [location]

    def keys(self):
        """Return the keys of the index's primary table."""
        index = self.db
        return index.primary_table.keys()
class SecondaryNamespace(BaseDB.DictLookup):
    """Look up record locations by a secondary identifier.

    A secondary table maps an identifier to a tab-separated list of primary
    keys; each primary key is then resolved through the primary table.
    """

    def __init__(self, db, namespace):
        """Bind to index *db*; *namespace* must be one of its secondary ones."""
        self.db = db
        self.namespace = namespace
        assert namespace in db.secondary_namespaces

    def __getitem__(self, name):
        """Return the list of Locations registered under identifier *name*.

        Raises KeyError when *name* is not present in this namespace's table.
        """
        index = self.db
        joined_keys = index._load_namespace(self.namespace).get(name, None)
        if joined_keys is None:
            raise KeyError("Cannot find %r key %r" % (self.namespace, name))

        locations = []
        for primary_key in joined_keys.split("\t"):
            # Primary records hold three tab-separated fields.
            record = index.primary_table[primary_key]
            filetag, start, length = record.split("\t")
            filename = index.fileid_info[filetag][0]
            location = Location.Location(self.namespace, name, filename,
                                         long(start), long(length))
            locations.append(location)
        return locations

    def keys(self):
        """Return the keys of this namespace's lookup table."""
        return self.db._load_namespace(self.namespace).keys()
class BerkeleyDB(BaseDB.OpenDB, BaseDB.WriteDB):
    """An index stored as Berkeley DB B-tree tables inside one directory.

    The primary table (``key_<primary namespace>``) maps a primary key to
    ``"filetag\\tstartpos\\tlength"``.  Each secondary table
    (``id_<namespace>``) maps a secondary identifier to a tab-separated list
    of primary keys and is opened lazily on first use.
    """

    def __init__(self, dbname, mode = "r"):
        """Open the index in directory *dbname*.

        Parameters:
            dbname -- directory holding the index.
            mode -- "r" opens the tables read-only, "rw" read/write.

        Raises:
            TypeError if *mode* is neither "r" nor "rw".  Errors raised by
            ``BaseDB.OpenDB.__init__`` or by Berkeley DB propagate; in the
            latter case the handles opened so far are closed first.
        """
        if mode not in ("r", "rw"):
            raise TypeError("Unknown mode: %r" % (mode,))

        self.__need_flush = 0
        BaseDB.OpenDB.__init__(self, dbname, INDEX_TYPE)

        # Stays None until everything is open; close()/__del__ key off it.
        self.dbenv = None

        if mode == "r":
            self._dbopen_flags = db.DB_RDONLY
        else:
            self._dbopen_flags = 0

        dbenv = db.DBEnv()
        primary_table = None
        opened = False
        try:
            envflags = db.DB_THREAD | db.DB_INIT_MPOOL
            dbenv.open(dbname, envflags)

            primary_table = db.DB(dbenv)
            # 0o660 equals the legacy literal 0660 and also parses on
            # Python 3.
            primary_table.open("key_%s" % (self.primary_namespace,),
                               None,
                               db.DB_BTREE, self._dbopen_flags, 0o660)
            opened = True
        finally:
            if not opened:
                # Do not leak the environment or a half-opened table when
                # construction fails.
                if primary_table is not None:
                    primary_table.close()
                dbenv.close()

        self.primary_table = primary_table
        self.secondary_tables = {}
        self.dbenv = dbenv

    def _load_namespace(self, namespace):
        """Return the lookup table for secondary *namespace*, opening it once.

        Opened tables are cached in ``self.secondary_tables``.  A table is
        only cached after its open() succeeded, so a failed open is retried
        on the next call instead of handing out an unopened handle.
        """
        table = self.secondary_tables.get(namespace)
        if table is None:
            dbname = "id_%s" % namespace
            table = db.DB(self.dbenv)
            opened = False
            try:
                table.open(dbname, None,
                           db.DB_BTREE,
                           self._dbopen_flags, 0)
                opened = True
            finally:
                if not opened:
                    table.close()
            self.secondary_tables[namespace] = table
        return table

    def add_record(self, filetag, startpos, length, table):
        """Register one record in the primary and secondary tables.

        Parameters:
            filetag -- tag of the file containing the record.
            startpos, length -- position and size of the record; both are
                formatted with ``BaseDB._int_str``.
            table -- mapping from namespace name to a list of identifiers.
                It must contain exactly one identifier for the primary
                namespace; secondary namespaces are optional.

        Raises:
            TypeError if the primary namespace does not have exactly one
            identifier, or if that identifier is already in the index.
        """
        key_list = table[self.primary_namespace]
        if len(key_list) != 1:
            raise TypeError(
                "Field %s has %d entries but must have only one "
                "(must be unique)" % (repr(self.primary_namespace),
                                      len(key_list)))
        key = key_list[0]
        if self.primary_table.has_key(key):
            raise TypeError("Field %r = %r already exists" %
                            (self.primary_namespace, key))
        self.primary_table[key] = "%s\t%s\t%s" % (filetag,
                                                  BaseDB._int_str(startpos),
                                                  BaseDB._int_str(length))

        for namespace in self.secondary_namespaces:
            lookup = self._load_namespace(namespace)
            # Each secondary identifier maps to a tab-separated list of
            # primary keys; append this record's key to that list.
            for val in table.get(namespace, ()):
                existing = lookup.get(val, None)
                if existing is None:
                    lookup[val] = key
                else:
                    lookup[val] = existing + "\t" + key
        self.__need_flush = 1

    def flush(self):
        """Rewrite config.dat if records were added since the last flush."""
        if not self.__need_flush:
            return
        config_filename = os.path.join(self.dbname, "config.dat")
        # Use the namespace list itself rather than the keys of the lazily
        # filled table cache: the cache has arbitrary order and only holds
        # the tables that happen to have been opened.
        BaseDB.write_config(config_filename = config_filename,
                            index_type = INDEX_TYPE,
                            primary_namespace = self.primary_namespace,
                            secondary_namespaces = self.secondary_namespaces,
                            fileid_info = self.fileid_info,
                            formatname = self.formatname,
                            )
        self.__need_flush = 0

    def close(self):
        """Flush pending config changes and close all tables and the env.

        Calling close() on an index that is already closed (or that never
        finished opening) does nothing.
        """
        if getattr(self, "dbenv", None) is None:
            return
        self.flush()
        self.primary_table.close()
        for table in self.secondary_tables.values():
            table.close()
        self.dbenv.close()
        self.dbenv = self.primary_table = self.fileid_info = \
                     self.secondary_tables = None

    def __del__(self):
        """Close the index on garbage collection if it is still open."""
        # getattr: __init__ may have raised before self.dbenv was assigned.
        if getattr(self, "dbenv", None) is not None:
            self.close()

    def __getitem__(self, key):
        """Return the lookup object for namespace *key*.

        Returns a PrimaryNamespace for the primary namespace and a
        SecondaryNamespace otherwise.  Raises KeyError when *key* is not
        among ``self.keys()``.
        """
        if key not in self.keys():
            raise KeyError(key)
        if key == self.primary_namespace:
            return PrimaryNamespace(self, key)
        return SecondaryNamespace(self, key)
# Public entry point: open(dbname, mode) constructs a BerkeleyDB.  This rebinds
# the module-level name "open"; the builtin was saved earlier as _open.
open = BerkeleyDB
|