File: SimpleSeqRecord.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (153 lines) | stat: -rw-r--r-- 5,803 bytes
parent folder | download | duplicates (3)
"""Index a file based on information in a SeqRecord object.

This indexer tries to make it simple to index a file of records (e.g. a
GenBank file full of entries) so that individual records can be
readily retrieved.

The indexing in this file takes place by converting the elements in the
file into SeqRecord objects, and then indexing by some item in these 
SeqRecords. This is a slower method, but is very flexible.

We have two default functions to index by the id and name elements of a
SeqRecord (ie. LOCUS and accession number from GenBank). There is also 
a base class which you can derive from to create your own indexer
which allows you to index by anything you feel like using python code.
"""
from Bio.builders.SeqRecord.sequence import BuildSeqRecord

# --- base class for indexing using SeqRecord information
class BaseSeqRecordIndexer:
    """Abstract starting point for indexers driven by SeqRecord contents.

    Subclass this and override primary_key_name, secondary_key_names and
    get_id_dictionary to index on whatever SeqRecord information you like.
    Called on this class directly, those three methods only raise
    NotImplementedError.
    """
    def __init__(self):
        # The base class keeps no state of its own.
        return None

    def get_builder(self):
        """Return a FixDocumentBuilder that calls back into get_id_dictionary.
        """
        return FixDocumentBuilder(self.get_id_dictionary)

    def primary_key_name(self):
        """Return the primary key name; must be overridden by subclasses.
        """
        raise NotImplementedError("Please implement in derived classes")

    def secondary_key_names(self):
        """Return the secondary key names; must be overridden by subclasses.
        """
        raise NotImplementedError("Please implement in derived classes")

    def get_id_dictionary(self, seq_record):
        """Return the id information for seq_record; must be overridden.
        """
        raise NotImplementedError("Please implement in derived classes")

class SimpleIndexer(BaseSeqRecordIndexer):
    """Indexer keyed on the .id and .name attributes of a SeqRecord.

    A deliberately simple scheme that should be enough for simple cases.
    The easiest way to use it is through the create_*db functions of this
    module, which use it as their default indexer.
    """
    def __init__(self):
        BaseSeqRecordIndexer.__init__(self)

    def primary_key_name(self):
        """Return "id", the key under which the record id is stored.
        """
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases".
        """
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Map each key name to a list of values taken from seq_record.

        "id" and "name" each hold a one-element list built from the
        matching SeqRecord attribute; "aliases" is always empty.
        """
        id_info = {}
        id_info["id"] = [seq_record.id]
        id_info["name"] = [seq_record.name]
        # XXX implement aliases once we have this attribute in SeqRecords
        id_info["aliases"] = []
        return id_info

class FunctionIndexer(BaseSeqRecordIndexer):
    """Indexer to index based on values returned by a function.

    This class is passed a function which will return id, name and alias
    information from a SeqRecord object. It needs to return either one item,
    which is the id, or three items which are (in order) the id, a list of
    names, and a list of aliases.

    This indexer allows indexing to be completely flexible based on passed
    functions.
    """
    def __init__(self, index_function):
        """Store index_function, a callable taking a SeqRecord.
        """
        BaseSeqRecordIndexer.__init__(self)
        self.index_function = index_function

    def primary_key_name(self):
        """Return "id", the key under which the record id is stored.
        """
        return "id"

    def secondary_key_names(self):
        """Return the secondary key names, "name" and "aliases".
        """
        return ["name", "aliases"]

    def get_id_dictionary(self, seq_record):
        """Call the index function and normalise its result to a dictionary.

        Returns a dictionary with the keys "id" (a one-element list holding
        the id), "name" and "aliases". When the index function returns a
        single item, "name" and "aliases" are empty lists; when it returns
        three items, the second and third are used as given.

        Raises ValueError if the index function returns a list or tuple
        whose length is neither 1 nor 3.
        """
        items = self.index_function(seq_record)
        # Anything that is not a list or tuple (including subclasses of
        # them) is treated as a bare id.
        if not isinstance(items, (list, tuple)):
            items = [items]
        if len(items) == 1:
            seq_id = items[0]
            name = []
            aliases = []
        elif len(items) == 3:
            seq_id, name, aliases = items
        else:
            # Wrap items in a one-element tuple: formatting with a bare
            # tuple would spread its elements over the format string and
            # raise TypeError instead of the intended ValueError.
            raise ValueError("Unexpected items from index function: %s" %
                             (items,))

        return {"id" : [seq_id],
                "name" : name,
                "aliases" : aliases}

class FixDocumentBuilder(BuildSeqRecord):
    """SAX builder-style class that hands the parsed SeqRecord to a callback.

    It bends the SAX framework a little so that a fully built SeqRecord
    object can still be used to work out the ids for a record.

    You should only need this class directly for unusual cases; otherwise
    use the BaseSeqRecordIndexer interfaces.
    """
    def __init__(self, get_ids_callback):
        """Initialize with a callback that gets id info from a SeqRecord.

        get_ids_callback should be a callable that takes a SeqRecord
        object and returns a dictionary mapping id names to the valid
        ids for these names.
        """
        BuildSeqRecord.__init__(self)
        self._ids_callback = get_ids_callback

    def end_record(self, tag):
        """Override the builder function to replace the document attribute.
        """
        # let the parent class finish its end_record work first
        BuildSeqRecord.end_record(self, tag)
        # then swap the document for the dictionary the callback returns
        seq_record = self.document
        self.document = self._ids_callback(seq_record)

# --- convenience functions for indexing
# you should just use these unless you are doing something fancy
def create_berkeleydb(files, db_name, indexer = SimpleIndexer()):
    """Index the given files into a Bio.Mindy BerkeleyDB database.

    files is an iterable of filenames, each loaded in turn; db_name is
    passed to BerkeleyDB.create along with the key names reported by the
    indexer. The default indexer is a single SimpleIndexer instance created
    when this function is defined.
    """
    from Bio.Mindy import BerkeleyDB
    primary_key = indexer.primary_key_name()
    secondary_keys = indexer.secondary_key_names()
    db_creator = BerkeleyDB.create(db_name, primary_key, secondary_keys)
    # one builder is shared by every file that gets loaded
    record_builder = indexer.get_builder()
    for path in files:
        db_creator.load(path, builder=record_builder, fileid_info={})
    db_creator.close()

def create_flatdb(files, db_name, indexer = SimpleIndexer()):
    """Index the given files into a Bio.Mindy FlatDB database.

    files is an iterable of filenames, each loaded in turn; db_name is
    passed to FlatDB.create along with the key names reported by the
    indexer. The default indexer is a single SimpleIndexer instance created
    when this function is defined.
    """
    from Bio.Mindy import FlatDB
    primary_key = indexer.primary_key_name()
    secondary_keys = indexer.secondary_key_names()
    db_creator = FlatDB.create(db_name, primary_key, secondary_keys)
    # one builder is shared by every file that gets loaded
    record_builder = indexer.get_builder()
    for path in files:
        db_creator.load(path, builder=record_builder, fileid_info={})
    db_creator.close()