1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
|
#!/usr/bin/python
#
# PDBList.py
#
# A tool for tracking changes in the PDB Protein Structure Database.
#
# (c) 2003 Kristian Rother
# This work was supported by the German Ministry of Education
# and Research (BMBF). Project http://www.bcbio.de
#
# Contact the author
# homepage : http://www.rubor.de/bioinf
# email : kristian.rother@charite.de
#
#
# This Code is released under the conditions of the Biopython license.
# It may be distributed freely with respect to the original author.
# Any maintainer of the BioPython code may change this notice
# when appropriate.
#
# Last modified on Tue, Oct 21st 2003, Berlin
#
# Removed the 'write' option from the retrieve_pdb_file method: it is not used.
# Also added a 'dir' option (the pdb file is put in this directory if given),
# and an 'exist' option (test if the file is already there). This method
# now returns the name of the downloaded uncompressed file.
#
# -Thomas, 1/06/04
# Module documentation string, assigned explicitly to the module's __doc__.
__doc__="Access the PDB over the internet (for example to download structures)."
import os
import re
import shlex
import string
import subprocess
import sys
import urllib

try:
    # Python 3 location of urlopen
    from urllib.request import urlopen
except ImportError:
    # Python 2 location of urlopen
    from urllib import urlopen
class PDBList:
    """
    Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a method to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like, where PDB
    files are stored.

    If you want to use this module from behind a proxy, add the proxy
    variable to your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF="""
The Protein Data Bank: a computer-based archival file for macromolecular structures.
F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
J. Mol. Biol. 112 pp. 535-542 (1977)
http://www.pdb.org/.
"""

    def __init__(self, server='ftp://ftp.pdb.mdc-berlin.de', pdb=os.sep+'pdb'):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string
        @param pdb: root directory of the local PDB file tree
        @type pdb: string
        """
        # remote pdb server
        self.pdb_server = server
        # local pdb file tree
        self.local_pdb = pdb

    def _read_lines(self, url):
        """Fetch 'url' and return its lines as a list of native strings.

        The connection is always closed, even if reading fails.
        """
        handle = urlopen(url)
        try:
            raw_lines = handle.readlines()
        finally:
            handle.close()
        lines = []
        for raw in raw_lines:
            # Python 3 delivers bytes; the column-based parsers below compare
            # against str literals, so decode there. Python 2 str is untouched.
            if not isinstance(raw, str):
                raw = raw.decode('latin-1')
            lines.append(raw)
        return lines

    def get_recent_filenames(self):
        """Returns names of the newest three weekly files (added,mod,obsolete).

        Reads the directory listing with changed entries from the PDB server
        and returns a tuple of three URLs to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.

        Returns None if no dated directory could be found in the listing.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x 2 1002 sysadmin 512 Oct 6 18:28 20031006
        drwxrwxr-x 2 1002 sysadmin 512 Oct 14 02:14 20031013
        -rw-r--r-- 1 1002 sysadmin 1327 Mar 12 2001 README
        """
        lines = self._read_lines(self.pdb_server+'/pub/pdb/data/status/')
        maxdate = 0
        for line in lines:
            try:
                # columns 54-61 hold the directory name; only numeric
                # names (dates such as 20031013) are of interest
                date = int(line[54:62])
            except ValueError:
                # not a dated directory (e.g. README or a short line)
                continue
            if date > maxdate:
                maxdate = date
        if maxdate > 0:
            return self.get_filenames_by_date(str(maxdate))
        return None

    def get_filenames_by_date(self, date):
        """Returns URLs of three weekly files (added,mod,obsolete).

        Returns the URLs of the added, modified and obsolete pdb status
        files for a given date, e.g. '20031013'.
        """
        path = self.pdb_server+'/pub/pdb/data/status/%s/'%(date)
        return (path+'added.pdb', path+'modified.pdb', path+'obsolete.pdb')

    def get_list(self, url):
        """Retrieves a list of pdb codes from the given URL.

        Returns a list of pdb codes in the pdb status file with the
        given URL. The URLs are created by get_recent_filenames() or
        get_filenames_by_date(date).

        Typical contents of the list files parsed by this method;
        -rw-r--r-- 1 rcsb rcsb 330156 Oct 14 2003 pdb1cyq.ent
        -rw-r--r-- 1 rcsb rcsb 333639 Oct 14 2003 pdb1cz0.ent
        """
        codes = []
        for line in self._read_lines(url):
            # Fixed-column format: the 4-letter code sits directly before
            # the '.ent' suffix. Slicing never raises, so short lines are
            # simply skipped by the comparison.
            if line[61:65] == '.ent':
                codes.append(line[57:61])
        return codes

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes (lower case) that have
        ever been in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used).
        LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE 30-SEP-03 1Q1D 1QZR
        OBSLTE 26-SEP-03 1DYV 1UN2
        """
        lines = self._read_lines(self.pdb_server+'/pub/pdb/data/status/obsolete.dat')
        obsolete = []
        for line in lines:
            if line[:6] == 'OBSLTE':
                # columns 21-24 hold the obsoleted entry's code
                obsolete.append(line[21:25].lower())
        return obsolete

    def changed_this_week(self):
        """Returns 3 lists of new/modified/obsolete PDB entries for weekly updates.

        Returns all three lists (new, modified, obsolete) of pdb codes
        for this week, in that order.
        Uses get_recent_filenames() and get_list() for that.

        Raises IOError if the most recent status directory cannot be
        determined from the server listing.
        """
        urls = self.get_recent_filenames()
        if urls is None:
            # get_recent_filenames() signals failure with None; report it
            # clearly instead of failing on urls[0] with a TypeError.
            raise IOError('could not determine the most recent PDB status '
                          'directory on %s' % self.pdb_server)
        return [self.get_list(urls[0]),
                self.get_list(urls[1]),
                self.get_list(urls[2])]

    def retrieve_pdb_file(self, pdb_code, compression='.Z',
                          uncompress="gunzip", dir=None, exist=0):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.

        The name of the local, uncompressed file is returned.

        The compression should be '.Z' or '.gz'. 'uncompress' is
        the command called to uncompress the files; it may contain
        arguments (e.g. "gzip -d") and is run without a shell, with the
        downloaded file name appended as the last argument.

        @param pdb_code: four-letter PDB code (case is ignored)
        @type pdb_code: string
        @param dir: put the file in this directory (default: create a PDB-style directory tree)
        @type dir: string
        @param exist: if true, skip the download when the uncompressed file is already present
        @return: filename
        @rtype: string

        Raises OSError if the uncompress command cannot be started.
        The exit status of the uncompress command is not checked.
        """
        # get the structure
        code = pdb_code.lower()
        filename = "pdb%s.ent%s" % (code, compression)
        url = (self.pdb_server+
               '/pub/pdb/data/structures/divided/pdb/%s/pdb%s.ent%s'
               % (code[1:3], code, compression))
        # in which dir to put the pdb file?
        if dir is None:
            # Put in PDB style directory tree
            path = self.local_pdb+os.sep+code[1:3]
        else:
            # Put in specified directory
            path = dir
        if not os.access(path, os.F_OK):
            # makedirs also creates missing parents (e.g. the local pdb root)
            os.makedirs(path)
        filename = path+os.sep+filename
        # the final uncompressed file
        final_file = path+os.sep+"pdb%s.ent" % code
        if exist and os.path.exists(final_file):
            return final_file
        # Read the file
        handle = urlopen(url)
        try:
            data = handle.read()
        finally:
            handle.close()
        out = open(filename, 'wb')
        try:
            out.write(data)
        finally:
            out.close()
        # uncompress the file; an argument list (no shell) keeps paths with
        # spaces or shell metacharacters intact
        subprocess.call(shlex.split(uncompress) + [filename])
        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        You can call this module as a weekly cronjob.
        """
        changes = self.changed_this_week()
        new = changes[0]
        modified = changes[1]
        to_download = new + modified
        for pdb_code in to_download:
            try:
                print('retrieving %s'%(pdb_code))
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # keep going with the remaining entries, but let
                # KeyboardInterrupt/SystemExit stop the run
                print('error %s'%(pdb_code))
                # you can insert here some more log notes that
                # something has gone wrong.
        #
        # delete the obsolete files
        # this part could easily misbehave, so i commented it out.
        #
        # obsolete = changes[2]
        # for pdb_code in obsolete:
        #     file = self.local_pdb + os.sep + pdb_code[1:3] + os.sep + 'pdb%s.ent'%(pdb_code)
        #     os.remove(file)
if __name__ == '__main__':
    # Script entry point: always show the usage text, then either run the
    # weekly update ('update' argument) or walk through the usage examples.
    doc = """PDBList.py
(c) Kristian Rother 2003, Contributed to BioPython
Standalone usage
PDBList.py update - write weekly PDB updates to local /pdb tree.
PDBList.py - simple usage examples.
"""
    print(doc)

    if len(sys.argv) > 1 and sys.argv[1] == 'update':
        # update PDB, then stop before the examples
        pl = PDBList()
        pl.update_pdb()
        sys.exit(0)

    #
    # usage example
    #
    # 1. create object
    pl = PDBList()

    # 2. get all obsolete structure codes
    print("\nAll obsolete structures from the PDB server:")
    obsolete = pl.get_all_obsolete()
    print(' '.join(obsolete))

    # 3. get the weekly updated lists and print them in the order
    #    new, modified, obsolete
    changes = pl.changed_this_week()
    headings = ("\nThis weeks new structures:",
                "\nThis weeks modified structures:",
                "\nThis weeks obsolete structures:")
    for position, heading in enumerate(headings):
        print(heading)
        print(' '.join(changes[position]))
|