File: DBIdsClient.py

package info (click to toggle)
python-biopython 1.42-2
links: PTS
area: main
in suites: etch, etch-m68k
size: 17,584 kB
ctags: 12,272
sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (297 lines) | stat: -rwxr-xr-x 10,926 bytes
parent folder | download | duplicates (2)
"""Search and retrieve information given a set of database identifiers.

EUtils has two major modes.  One uses history while the other uses
database identifiers.  This is a high-level interface for working with
identifiers.  You should use this module to get information about a
set of known database identifiers.

See HistoryClient if you want to work with a large number of
identifiers or potentially large search results.

>>> from Bio import EUtils
>>> from Bio.EUtils import DBIdsClient
>>> client = DBIdsClient.DBIdsClient()
>>> result = client.search("dalke", retmax = 100)
>>> len(result)
30
>>> print result[0].efetch(retmode = "text", rettype = "abstract").read()

1: Pac Symp Biocomput  1997;:85-96

Using Tcl for molecular visualization and analysis.

Dalke A, Schulten K.

Beckman Institute, Urbana, IL 61801, USA.

Reading and manipulating molecular structure data is a standard task in every
molecular visualization and analysis program, but is rarely available in a form
readily accessible to the user. Instead, the development of new methods for
analysis, display, and interaction is often achieved by writing a new program,
rather than building on pre-existing software. We present the Tcl-based script
language used in our molecular modeling program, VMD, and show how it can access
information about the molecular structure, perform analysis, and graphically
display and animate the results. The commands are available to the user and make
VMD a useful environment for studying biomolecules.


PMID: 9390282 [PubMed - indexed for MEDLINE]

>>>


Find sequences similar to GI:4579714 which were published in 2002.

>>> protein = DBIdsClient.from_dbids(EUtils.DBIds("protein", "4579714"))
>>> neighbors = protein.neighbor_links("protein",
...        daterange = EUtils.DateRange("2002/01/01", "2002/12/31", "pdat"))
>>> dbids = neighbors.linksetdbs["protein_protein"].dbids
>>> len(dbids)
28
>>> print dbids
DBIds(u'protein', [u'4579714', u'25298947', u'24158913', u'24158914', u'24158915', u'17942993', u'17942994', u'17942995', u'20150921', u'20150922', u'20151159', u'25298949', u'19716034', u'20663737', u'20663738', u'20663741', u'24987328', u'25533128', u'25298946', u'25298948', u'23008597', u'20219020', u'21218340', u'21218344', u'19075395', u'21218338', u'21218342', u'21311795'])
>>> 
>>> print client.from_dbids(dbids[:5]).efetch(retmode="text",
...                                           rettype="summary").read()

1: BAA75200
Bacteriorhodopsin [Halobacterium sp.]
gi|4579714|dbj|BAA75200.1|[4579714]


2: H84300
bacteriorhodopsin [imported] - Halobacterium sp. NRC-1
gi|25298947|pir||H84300[25298947]


3: 1M0KA
Chain A, Bacteriorhodopsin K Intermediate At 1.43 A Resolution
gi|24158913|pdb|1M0K|A[24158913]


4: 1M0LA
Chain A, BacteriorhodopsinLIPID COMPLEX AT 1.47 A RESOLUTION
gi|24158914|pdb|1M0L|A[24158914]


5: 1M0MA
Chain A, Bacteriorhodopsin M1 Intermediate At 1.43 A Resolution
gi|24158915|pdb|1M0M|A[24158915]

>>>

"""

import types
import parse, Mixins, Config, ThinClient, Datatypes

class DBIdsLookup(object):
    """Look up information about a DBIds

    Holds the client used to talk to the server ('eutils') and the
    identifiers the lookup refers to ('records_dbids').

    To get the list of dbids, as interpreted by fetching the
    server's "uilist", use the "dbids" attribute.

    This class does not define 'efetch' itself; the 'dbids' property
    (and therefore 'elink') relies on a subclass or mixin such as
    SequenceDBIdsFetchMixin or PublicationDBIdsFetchMixin supplying it.
    """
    def __init__(self, eutils, records_dbids):
        """Remember the client and the identifiers to look up

        Parameters are:
          eutils -- the client object used for every server request
          records_dbids -- the DBIds this lookup refers to
        """
        self.eutils = eutils
        self.records_dbids = records_dbids

    def esummary(self, retmode = 'xml', rettype = None):
        """call esummary on this DBIds; returns the socket handle

        'retmode' and 'rettype' are forwarded to the client.  They used
        to be accepted here but silently dropped, so a caller asking for
        a non-default format always got the client's defaults.
        """
        return self.eutils.esummary_using_dbids(
            dbids = self.records_dbids,
            retmode = retmode,
            rettype = rettype)

    def summary(self):
        """get the summary for these DBIds, parsed into a Datatypes.Summary"""
        # The parser expects XML, so the mode is requested explicitly.
        return parse.parse_summary_xml(self.esummary("xml"))

    def elink(self,
              db = "pubmed",
              cmd = "neighbor",
              term = None,
              field = None,
              daterange = None):
        """call elink on this DBIds; returns the socket handle

        Note that this uses the 'dbids' property, which performs an
        efetch of the "uilist" first, rather than 'records_dbids'.
        """
        return self.eutils.elink_using_dbids(
            dbids = self.dbids,
            db = db,
            cmd = cmd,
            daterange = daterange,
            term = term,
            field = field,
            )

    def _get_dbids(self):
        """Fetch the "uilist" for these records and wrap it in a DBIds"""
        # 'efetch' comes from the fetch mixin combined with this class.
        infile = self.efetch(retmode = "text", rettype = "uilist")
        ids = parse.parse_fetch_identifiers(infile)
        return Datatypes.DBIds(self.records_dbids.db, ids)
    dbids = property(_get_dbids, None, None,
        "The DBIds for this results set, validated from the server's 'uilist'")
    
    
class DBIdsRecord(DBIdsLookup):
    """A lookup that refers to exactly one record on the server"""
    def summary(self):
        """Return the first (only) entry of the parsed summary"""
        # The inherited summary() is indexable; a single record only
        # needs its first element.
        summaries = DBIdsLookup.summary(self)
        return summaries[0]

class SequenceDBIdsFetchMixin:
    """Adds an 'efetch' method suited to sequence records

    Expects the class it is mixed into to provide the 'eutils' and
    'records_dbids' attributes.
    """
    def efetch(self, retmode = 'xml', rettype = None,
               seq_start = None, seq_stop = None, strand = None,
               complexity = None):
        """Fetch these records; returns what efetch_using_dbids returns

        'strand' must be None, 1 or 2; anything else raises TypeError
        before any request is made.  All other parameters are handed
        to the client unchanged.
        """
        allowed_strands = (None, 1, 2)
        if strand not in allowed_strands:
            raise TypeError("Strand can only be 1 (plus, default) or 2 (minus)")

        request = {
            "dbids": self.records_dbids,
            "retmode": retmode,
            "rettype": rettype,
            "seq_start": seq_start,
            "seq_stop": seq_stop,
            "strand": strand,
            "complexity": complexity,
        }
        return self.eutils.efetch_using_dbids(**request)

class SequenceDBIdsRecord(Mixins.SequenceFetchMixin,
                          SequenceDBIdsFetchMixin,
                          DBIdsRecord):
    """One sequence record, addressed by its database identifier

    Everything is inherited: the sequence-specific 'efetch' comes from
    SequenceDBIdsFetchMixin and the single-record 'summary' from
    DBIdsRecord.
    """

class PublicationDBIdsFetchMixin:
    """Adds an 'efetch' method suited to publication records

    Expects the class it is mixed into to provide the 'eutils' and
    'records_dbids' attributes.
    """
    def efetch(self, retmode = "xml", rettype = None):
        """Fetch these records; returns what efetch_using_dbids returns"""
        request = {
            "dbids": self.records_dbids,
            "retmode": retmode,
            "rettype": rettype,
        }
        return self.eutils.efetch_using_dbids(**request)

class PublicationDBIdsRecord(Mixins.PublicationFetchMixin,
                             PublicationDBIdsFetchMixin,
                             DBIdsRecord):
    """One publication record, addressed by its database identifier

    Everything is inherited: the publication-specific 'efetch' comes
    from PublicationDBIdsFetchMixin and the single-record 'summary'
    from DBIdsRecord.
    """

class BaseDBIdsRecordSet(DBIdsLookup):
    """Base class for dealing with a set of records, referenced by identifier

    Subclasses must define a '_record_class' attribute; it is the class
    used to wrap a single record when the set is indexed with an
    integer.
    """
    def __init__(self, eutils, records_dbids, metadata = None):
        """Store the client, the identifiers and optional metadata

        Parameters are:
          eutils -- the client object used for every server request
          records_dbids -- the DBIds covered by this set
          metadata -- extra information kept alongside the set (the
               search() method of DBIdsClient stores its parsed search
               result here); defaults to None
        """
        DBIdsLookup.__init__(self, eutils, records_dbids)
        self.metadata = metadata

    def __len__(self):
        """Number of records referenced by this RecordSet"""
        return len(self.records_dbids)

    def __getitem__(self, i):
        """Return a subset of the records, or a single record

        A slice gives a new record set of the same class (without the
        metadata); any other index gives a '_record_class' instance
        built from records_dbids.item(i).
        """
        # The builtin 'slice' is the very same type as types.SliceType
        # on Python 2, and unlike that alias it also exists on Python 3.
        if isinstance(i, slice):
            # Metadata is not passed downwards
            if i.step is None:
                # Kept as a plain two-argument slice, as in the original,
                # so the underlying DBIds sees the same kind of request.
                return self.__class__(
                    self.eutils,
                    self.records_dbids[i.start:i.stop])
            return self.__class__(
                self.eutils,
                self.records_dbids[i.start:i.stop:i.step])

        return self._record_class(self.eutils, self.records_dbids.item(i))
        
class SequenceDBIdsRecordSet(Mixins.SequenceFetchMixin,
                             SequenceDBIdsFetchMixin,
                             BaseDBIdsRecordSet):
    """a set of sequence records, referenced by database identifier

    Combines the sequence-specific 'efetch' of SequenceDBIdsFetchMixin
    with the length and indexing support of BaseDBIdsRecordSet.
    """
    # Class that BaseDBIdsRecordSet.__getitem__ instantiates when the set
    # is indexed with a single position rather than a slice.
    _record_class = SequenceDBIdsRecord

class PublicationDBIdsRecordSet(Mixins.PublicationFetchMixin,
                                PublicationDBIdsFetchMixin,
                                BaseDBIdsRecordSet):
    """a set of publication records, referenced by database identifier

    Combines the publication-specific 'efetch' of
    PublicationDBIdsFetchMixin with the length and indexing support of
    BaseDBIdsRecordSet.
    """
    # Class that BaseDBIdsRecordSet.__getitem__ instantiates when the set
    # is indexed with a single position rather than a slice.
    _record_class = PublicationDBIdsRecord


def _get_recordset_constructor(db, dbtype):
    """Pick the RecordSet class that matches a database

    The database type is resolved through Config.databases.gettype(db,
    dbtype).  A sequence type maps to SequenceDBIdsRecordSet and a
    publication type to PublicationDBIdsRecordSet; any other resolved
    type raises TypeError.
    """
    resolved_type = Config.databases.gettype(db, dbtype)
    if resolved_type == Config.SEQUENCE_TYPE:
        return SequenceDBIdsRecordSet
    if resolved_type == Config.PUBLICATION_TYPE:
        return PublicationDBIdsRecordSet
    raise TypeError("Unknown database type: %r" % (resolved_type,))

def from_dbids(dbids, dbtype = None, eutils = None):
    """Build a RecordSet for a set of database identifiers

    Convenience wrapper: creates a DBIdsClient and calls its
    from_dbids method.

    Parameters are:
      dbids -- a DBIds
      dbtype -- the dbtype to use (EUtils.Config.{SEQUENCE,PUBLICATION}_TYPE)
           in case dbids.db isn't in the list of known NCBI databases.
           Defaults to None.
      eutils -- the ThinClient to use; when None, DBIdsClient creates a
           new ThinClient.ThinClient()
    """
    client = DBIdsClient(eutils)
    return client.from_dbids(dbids, dbtype)

class DBIdsClient:
    """Entry point that builds RecordSets from a search or from dbids

    An optional ThinClient may be given to the constructor; it is the
    object used for connecting to NCBI.
    """
    def __init__(self, eutils = None):
        """Keep the given client, or create a ThinClient.ThinClient()"""
        if eutils is not None:
            self.eutils = eutils
        else:
            self.eutils = ThinClient.ThinClient()

    def from_dbids(self, dbids, dbtype = None):
        """Wrap the given DBIds in the matching RecordSet

        The returned RecordSet can be used to fetch data from NCBI
        related to those DBIds.  It carries no metadata.
        """
        recordset_class = _get_recordset_constructor(dbids.db, dbtype)
        return recordset_class(self.eutils, dbids, None)

    def search(self,
               term,
               db = "pubmed",
               field = None,

               retstart = 0,
               retmax = 20,

               daterange = None,
               dbtype = None,
               ):
        """Run an Entrez search and return the hits as a RecordSet

        The parameters are:
          'term' -- the query string in the Entrez query language; see
             http://www.ncbi.nlm.nih.gov/entrez/query/static/help/pmhelp.html
          'db' -- the database to search

          'field' -- the field to use for unqualified words
                  Eg, "dalke[au] AND gene" with field==None becomes
                    dalke[au] AND (genes[MeSH Terms] OR gene[Text Word])
                  and "dalke[au] AND gene" with field=="au" becomes
                    dalke[au] AND genes[Author]
                 (Yes, I think the first "au" should be "Author" too)

          'retstart' -- include identifiers in the output, starting with
                   position 'retstart' (normally starts with 0)
          'retmax' -- return at most 'retmax' identifiers in the output
                   (if not specified, NCBI returns 20 identifiers)
          'daterange' -- a date restriction; either WithinNDays or DateRange

          'dbtype' -- (optional) the database type (Config.PUBLICATION_TYPE
                  or SEQUENCE_TYPE).  Overrides the type based on the 'db'

        The parsed search result is stored as the metadata of the
        returned RecordSet.
        """
        # Resolve the RecordSet class first so that an unknown database
        # type is reported before any request is sent.
        recordset_class = _get_recordset_constructor(db, dbtype)

        query = {
            "term": term,
            "db": db,
            "field": field,
            "retstart": retstart,
            "retmax": retmax,
            "daterange": daterange,
        }
        response = self.eutils.esearch(**query)

        # NOTE(review): the [None] argument is passed through unchanged;
        # presumably a placeholder for a history reference that this
        # client does not use -- confirm against parse.parse_search.
        search_result = parse.parse_search(response, [None])

        found_dbids = Datatypes.DBIds(db, search_result.ids)
        return recordset_class(self.eutils, found_dbids, search_result)