1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
|
"""Index a file based on information in a SeqRecord object.
This indexer tries to make it simple to index a file of records (e.g. a
GenBank file full of entries) so that individual records can be
readily retrieved.
The indexing in this file takes place by converting the elements in the
file into SeqRecord objects, and then indexing by some item in these
SeqRecords. This is a slower method, but is very flexible.
We have two default functions to index by the id and name elements of a
SeqRecord (i.e. LOCUS and accession number from GenBank). There is also
a base class which you can derive from to create your own indexer,
which allows you to index by anything you feel like using Python code.
"""
from Bio.builders.SeqRecord.sequence import BuildSeqRecord
# --- base class for indexing using SeqRecord information
class BaseSeqRecordIndexer:
    """Abstract base for indexers keyed on SeqRecord information.

    Derive from this class to index records by some piece of information
    held in a SeqRecord. The three key-related methods below raise
    NotImplementedError here, so a subclass has to override them before
    the indexer is useful.
    """

    def __init__(self):
        """Create the indexer; the base class keeps no state."""

    def get_builder(self):
        """Return a FixDocumentBuilder wired to this indexer.

        The builder is handed the bound get_id_dictionary method as its
        id-extraction callback.
        """
        return FixDocumentBuilder(self.get_id_dictionary)

    def primary_key_name(self):
        """Return the name of the primary key (override in subclasses)."""
        raise NotImplementedError("Please implement in derived classes")

    def secondary_key_names(self):
        """Return the secondary key names (override in subclasses)."""
        raise NotImplementedError("Please implement in derived classes")

    def get_id_dictionary(self, seq_record):
        """Map a SeqRecord to its id dictionary (override in subclasses)."""
        raise NotImplementedError("Please implement in derived classes")
class SimpleIndexer(BaseSeqRecordIndexer):
    """Indexer keyed on the .id and .name attributes of a SeqRecord.

    A deliberately simple indexing scheme intended for simple cases. The
    easiest way to use it is through the create_*db functions of this
    module.
    """

    def __init__(self):
        BaseSeqRecordIndexer.__init__(self)

    def primary_key_name(self):
        """Return the primary key name, "id"."""
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases"."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the id dictionary for seq_record.

        Each key maps to a list of values: the record's id, the record's
        name, and (for now) an empty alias list.
        """
        # XXX implement aliases once we have this attribute in SeqRecords
        return {
            "id": [seq_record.id],
            "name": [seq_record.name],
            "aliases": [],
        }
class FunctionIndexer(BaseSeqRecordIndexer):
    """Indexer to index based on values returned by a function.

    This class is passed a function which will return id, name and alias
    information from a SeqRecord object. It needs to return either one item,
    which is the id, or three items which are (in order) the id, a list of
    names, and a list of aliases.

    This indexer allows indexing to be completely flexible based on passed
    functions.
    """

    def __init__(self, index_function):
        """Store the callable used to extract key information.

        index_function is called with a SeqRecord and must return either a
        single id or an (id, names, aliases) sequence.
        """
        BaseSeqRecordIndexer.__init__(self)
        self.index_function = index_function

    def primary_key_name(self):
        """Return the primary key name, "id"."""
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases"."""
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Return the id dictionary built from index_function's result.

        Raises ValueError when the function returns a list or tuple whose
        length is neither 1 nor 3.
        """
        items = self.index_function(seq_record)
        # A bare (non-sequence) return value is treated as a lone id.
        # isinstance also accepts list/tuple subclasses such as namedtuples.
        if not isinstance(items, (list, tuple)):
            items = [items]

        if len(items) == 1:
            seq_id = items[0]
            name = []
            aliases = []
        elif len(items) == 3:
            seq_id, name, aliases = items
        else:
            # Wrap in a one-element tuple: with a bare tuple on the right of
            # "%" its elements would be taken as separate format arguments
            # and a TypeError would mask the intended ValueError.
            raise ValueError("Unexpected items from index function: %s" %
                             (items,))

        return {"id": [seq_id],
                "name": name,
                "aliases": aliases}
class FixDocumentBuilder(BuildSeqRecord):
    """A SAX builder-style class that exposes a parsed SeqRecord's ids.

    It bends the SAX-style builder framework so that a fully built
    SeqRecord can be inspected by arbitrary Python code before the result
    is handed on.

    You should not normally need this class directly unless you are doing
    something unusual; use the BaseSeqRecordIndexer interfaces instead.
    """

    def __init__(self, get_ids_callback):
        """Initialize with a callback that extracts id info from a SeqRecord.

        get_ids_callback should be a callable that takes a SeqRecord object
        and returns a dictionary mapping id names to the valid ids for
        those names.
        """
        BuildSeqRecord.__init__(self)
        self._ids_callback = get_ids_callback

    def end_record(self, tag):
        """Override the builder hook to replace the document attribute."""
        # Let the parent builder finish the record first; the code below
        # relies on it leaving the result in self.document.
        BuildSeqRecord.end_record(self, tag)

        # Swap the built record for the id dictionary the indexer needs.
        built_record = self.document
        self.document = self._ids_callback(built_record)
# --- convenience functions for indexing
# you should just use these unless you are doing something fancy
def create_berkeleydb(files, db_name, indexer=SimpleIndexer()):
    """Create a database through Bio.Mindy.BerkeleyDB and load files into it.

    files is an iterable of filenames, db_name is passed to
    BerkeleyDB.create, and indexer supplies the key names and the builder
    used while loading. The default indexer is a single SimpleIndexer
    instance created when this function is defined.
    """
    from Bio.Mindy import BerkeleyDB

    primary_key = indexer.primary_key_name()
    secondary_keys = indexer.secondary_key_names()
    db_creator = BerkeleyDB.create(db_name, primary_key, secondary_keys)

    # One builder is shared by all files; each load gets its own empty
    # fileid_info dictionary.
    record_builder = indexer.get_builder()
    for path in files:
        db_creator.load(path, builder=record_builder, fileid_info={})

    db_creator.close()
def create_flatdb(files, db_name, indexer=SimpleIndexer()):
    """Create a database through Bio.Mindy.FlatDB and load files into it.

    files is an iterable of filenames, db_name is passed to FlatDB.create,
    and indexer supplies the key names and the builder used while loading.
    The default indexer is a single SimpleIndexer instance created when
    this function is defined.
    """
    from Bio.Mindy import FlatDB

    primary_key = indexer.primary_key_name()
    secondary_keys = indexer.secondary_key_names()
    db_creator = FlatDB.create(db_name, primary_key, secondary_keys)

    # One builder is shared by all files; each load gets its own empty
    # fileid_info dictionary.
    record_builder = indexer.get_builder()
    for path in files:
        db_creator.load(path, builder=record_builder, fileid_info={})

    db_creator.close()
|