File: PDBList.py

package info (click to toggle)
python-biopython 1.42-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 17,584 kB
  • ctags: 12,272
  • sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (282 lines) | stat: -rw-r--r-- 9,687 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/python
#
# PDBList.py
#
# A tool for tracking changes in the PDB Protein Structure Database.
#
# (c) 2003 Kristian Rother
# This work was supported by the German Ministry of Education
# and Research (BMBF). Project http://www.bcbio.de
# 
# Contact the author
#    homepage : http://www.rubor.de/bioinf
#    email    : kristian.rother@charite.de
#
#
# This Code is released under the conditions of the Biopython license.
# It may be distributed freely with respect to the original author.
# Any maintainer of the BioPython code may change this notice
# when appropriate.
#
# Last modified on Tue, Oct 21st 2003, Berlin
#
# Removed the 'write' option from the retrieve_pdb_file method: it is not used.
# Also added a 'dir' option (the pdb file is put in this directory if given),
# and an 'exist' option (test if the file is already there). This method
# now returns the name of the downloaded uncompressed file.
#
# -Thomas, 1/06/04

__doc__="Access the PDB over the internet (for example to download structures)."

import urllib,string,re,os,sys

class PDBList:
    """
    Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists
    are released on a weekly basis.

    The class also provides a function to retrieve PDB files from the
    server. To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If You want to use this module from inside a proxy, add
    the proxy variable to Your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """
    
    PDB_REF="""
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    def __init__(self,server='ftp://ftp.pdb.mdc-berlin.de', pdb=os.sep+'pdb'):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the PDB server (or mirror) to query
        @type server: string

        @param pdb: root directory of the local PDB file tree
        @type pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree
        self.local_pdb = pdb

    def _read_lines(self, url):
        """Return the document at the given URL as a list of lines.

        The connection is always closed, even if reading fails.
        """
        handle = urllib.urlopen(url)
        try:
            return handle.readlines()
        finally:
            handle.close()

    def _latest_status_date(self, lines):
        """Return the largest date in a status directory listing, or 0.

        The directory name is read from the fixed columns 54-61 of each
        listing line; lines whose columns do not hold a number (for
        example the README entry) are skipped.
        """
        maxdate = 0
        for line in lines:
            try:
                date = int(line[54:62])
            except ValueError:
                # not a dated directory entry
                continue
            if date > maxdate:
                maxdate = date
        return maxdate

    def _codes_from_listing(self, lines):
        """Return the pdb codes of all 'pdbXXXX.ent' entries in a listing.

        The code is taken from the fixed columns 57-60, and a line only
        counts if columns 61-64 hold the '.ent' suffix.
        """
        return [line[57:61] for line in lines if line[61:65] == '.ent']

    def _obsolete_codes(self, lines):
        """Return the lowercased first pdb code of every OBSLTE record."""
        return [line[21:25].lower() for line in lines
                if line[:6] == 'OBSLTE']

    def get_recent_filenames(self):
        """Returns names of the newest three weekly files (added,mod,obsolete).
        
        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if no dated directory is found in the listing.
        
        Contents of the data/status dir (20031013 would be used);
drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
-rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        lines = self._read_lines(self.pdb_server+'/pub/pdb/data/status/')
        maxdate = self._latest_status_date(lines)
        if maxdate > 0:
            return self.get_filenames_by_date(str(maxdate))
        return None

    def get_filenames_by_date(self,date):
        """Returns URL's of three weekly files (added,mod,obsolete).
        
        Returns names of added, modified and obsolete pdb status files
        for a given date, e.g. '20031013'.

        @param date: name of the status directory, e.g. '20031013'
        @type date: string

        @return: (added URL, modified URL, obsolete URL)
        @rtype: tuple of three strings
        """
        path = self.pdb_server+'/pub/pdb/data/status/%s/'%(date)
        return (path+'added.pdb',path+'modified.pdb',path+'obsolete.pdb')

    def get_list(self,url):
        """Retrieves a list of pdb codes from the given URL.
        
        Returns a list of pdb codes in the pdb status file with the
        given URL. The URLs are created by get_recent_filenames() or
        get_filenames_by_date(date).
        
        Typical contents of the list files parsed by this method;
-rw-r--r--   1 rcsb     rcsb      330156 Oct 14  2003 pdb1cyq.ent
-rw-r--r--   1 rcsb     rcsb      333639 Oct 14  2003 pdb1cz0.ent

        @param url: URL of a status list file
        @type url: string

        @return: the pdb codes found in the list
        @rtype: list of strings
        """
        return self._codes_from_listing(self._read_lines(url))

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes (in lower case) that
        have ever been in the PDB.
        
        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used).
 LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
OBSLTE     30-SEP-03 1Q1D      1QZR
OBSLTE     26-SEP-03 1DYV      1UN2    
        """
        lines = self._read_lines(
            self.pdb_server+'/pub/pdb/data/status/obsolete.dat')
        return self._obsolete_codes(lines)

    def changed_this_week(self):
        """Returns 3 lists of new/modified/obsolete PDB entries for weekly updates.
        
        Returns all three lists (new, modified, obsolete) pdb codes
        for this week.
        Uses get_recent_filenames() and get_list() for that.

        @return: [new codes, modified codes, obsolete codes]
        @rtype: list of three lists of strings

        @raise IOError: if no dated status directory could be found
        """
        urls = self.get_recent_filenames()
        if urls is None:
            # fail with a clear message instead of an obscure TypeError
            # from indexing None
            raise IOError("no dated status directory found on %s"
                          % self.pdb_server)
        return [self.get_list(url) for url in urls]

    def retrieve_pdb_file(self,pdb_code, compression='.Z', 
            uncompress="gunzip", dir=None, exist=0):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.

        The compressed file is downloaded into the target directory and
        the 'uncompress' command is then run on it. The name of the
        uncompressed file is returned (not its contents).

        @param pdb_code: four-letter PDB code (lowercased for file names)
        @type pdb_code: string

        @param compression: suffix of the compressed file on the server,
        should be '.Z' or '.gz'
        @type compression: string

        @param uncompress: the command called to uncompress the file. It is
        split on whitespace and the downloaded file name is appended as the
        last argument; no shell is involved.
        @type uncompress: string

        @param dir: put the file in this directory (default: create a PDB-style directory tree) 
        @type dir: string

        @param exist: if true and the uncompressed file is already there,
        return its name without downloading anything
        @type exist: int

        @return: filename
        @rtype: string

        @raise OSError: if the target directory cannot be created or the
        uncompress command cannot be started
        """
        # imported here so the module-level import line stays untouched
        import subprocess

        # get the structure
        code = pdb_code.lower()
        archive_name = "pdb%s.ent%s" % (code, compression)
        url = (self.pdb_server +
               '/pub/pdb/data/structures/divided/pdb/%s/%s'
               % (code[1:3], archive_name))
        # in which dir to put the pdb file?
        if dir is None:
            # Put in PDB style directory tree
            path = self.local_pdb + os.sep + code[1:3]
        else:
            # Put in specified directory
            path = dir
        if not os.path.exists(path):
            # makedirs also creates a missing local pdb root
            os.makedirs(path)
        filename = path + os.sep + archive_name
        # the final uncompressed file
        final_file = path + os.sep + "pdb%s.ent" % code
        if exist and os.path.exists(final_file):
            return final_file
        # Read the file
        remote = urllib.urlopen(url)
        try:
            data = remote.read()
        finally:
            remote.close()
        archive = open(filename, 'wb')
        try:
            archive.write(data)
        finally:
            archive.close()
        # uncompress the file; an argument list (no shell) keeps unusual
        # characters in the path or code from being interpreted.
        # The exit status is not checked, so callers still get the file
        # name back when the uncompressed file was already present.
        subprocess.call(uncompress.split() + [filename])

        return final_file
            

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        You can call this module as a weekly cronjob.

        A failure for a single entry is reported on standard output and
        does not stop the remaining downloads.
        """
        changes  = self.changed_this_week()
        new      = changes[0]
        modified = changes[1]
        to_download = new + modified

        for pdb_code in to_download:
            try:
                print('retrieving %s'%(pdb_code))
                self.retrieve_pdb_file(pdb_code)
            except (KeyboardInterrupt, SystemExit):
                # let the user (or the interpreter) stop the update
                raise
            except Exception:
                print('error %s'%(pdb_code))
                # you can insert here some more log notes that
                # something has gone wrong.            

        #
        # delete the obsolete files
        #    this part could easily misbehave, so i commented it out.
        #
        # obsolete = changes[2]
        # for pdb_code in obsolete:
        #     file = self.local_pdb + os.sep + pdb_code[1:3] + os.sep + 'pdb%s.ent'%(pdb_code)
        #     os.remove(file)


if __name__ == '__main__':
    # Banner printed on every standalone invocation.
    usage = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Standalone usage

    PDBList.py update - write weekly PDB updates to local /pdb tree.
    PDBList.py        - simple usage examples.
    
    """
    print(usage)

    # One list object serves both the update mode and the examples.
    pdb_list = PDBList()

    # "PDBList.py update": download this week's changes, then stop.
    if sys.argv[1:2] == ['update']:
        pdb_list.update_pdb()
        sys.exit(0)

    #
    # usage example
    #
    separator = '    '

    # all obsolete structure codes ever recorded
    print("\nAll obsolete structures from the PDB server:")
    print(separator.join(pdb_list.get_all_obsolete()))

    # the weekly updated lists, in the order new / modified / obsolete
    weekly = pdb_list.changed_this_week()
    headings = ("\nThis weeks new structures:",
                "\nThis weeks modified structures:",
                "\nThis weeks obsolete structures:")
    for heading, codes in zip(headings, weekly):
        print(heading)
        print(separator.join(codes))