File: PDBList.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 46,856 kB
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (361 lines) | stat: -rw-r--r-- 13,494 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/usr/bin/env python
#
# PDBList.py
#
# A tool for tracking changes in the PDB Protein Structure Database.
#
# (c) 2003 Kristian Rother
# This work was supported by the German Ministry of Education
# and Research (BMBF). Project http://www.bcbio.de
#
# Contact the author
#    homepage : http://www.rubor.de/bioinf
#    email    : krother@genesilico.pl
#
#
# This code is released under the conditions of the Biopython license.
# It may be distributed freely with respect to the original author.
# Any maintainer of the Biopython code may change this notice
# when appropriate.

""" Access the PDB over the internet (e.g. to download structures). """

from __future__ import print_function

import contextlib
import gzip
import os
import shutil

# Importing these functions with leading underscore as not intended for reuse
from Bio._py3k import urlopen as _urlopen
from Bio._py3k import urlretrieve as _urlretrieve


class PDBList(object):
    """
    This class provides quick access to the structure lists on the
    PDB server or its mirrors. The structure lists contain
    four-letter PDB codes, indicating that structures are
    new, have been modified or are obsolete. The lists are released
    on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=os.getcwd(),
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one."""
        self.pdb_server = server  # remote pdb server
        self.local_pdb = pdb  # local pdb file tree

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_files.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line.
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                assert len(pdb) == 4
                answer.append(pdb)
        return answer

    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a tuple of three URL's to the files of new, modified and
        obsolete entries from the most recent list. The directory with the
        largest numerical name is used.
        Returns None if something goes wrong.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = filter(str.isdigit,
                            (x.split()[-1] for x in handle.readlines())
                            )[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieves a big file containing all the
        PDB entries and some annotation to them.
        Returns a list of PDB codes in the index file.
        """
        print("retrieving index file. Takes about 5 MB.")
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        with contextlib.closing(_urlopen(url)) as handle:
            all_entries = [line[:4] for line in handle.readlines()[2:]
                           if len(line) > 4]
        return all_entries

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(pdb)
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
        """ Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.

        The PDB structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = "pdb%s.ent.gz" % code
        pdb_dir = "divided" if not obsolete else "obsolete"
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s' %
               (pdb_dir, code[1:3], archive_fn))

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, "pdb%s.ent" % code)  # (decompressed)

        # Skip download if the file already exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        _urlretrieve(url, filename)

        # Uncompress the archive, delete when done
        # Can't use context manager with gzip.open until Python 2.7
        gz = gzip.open(filename, 'rb')
        with open(final_file, 'wb') as out:
            out.writelines(gz)
        gz.close()
        os.remove(filename)

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        You can call this module as a weekly cronjob.
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb,
                                        'pdb%s.ent' % pdb_code)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, pdb_code[1:3],
                                        'pdb%s.ent' % pdb_code)
                new_dir = os.path.join(self.obsolete_pdb, pdb_code[1:3])
            new_file = os.path.join(new_dir, 'pdb%s.ent' % pdb_code)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.mkdir(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present in the local obsolete
        PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieves a (big) file containing all the sequences of PDB entries
        and writes it to a file.
        """
        print("Retrieving sequence file (takes about 15 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)


if __name__ == '__main__':

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        if len(sys.argv) > 3:
            for option in sys.argv[3:]:
                if option == '-d':
                    pl.flat_tree = 1
                elif option == '-o':
                    pl.overwrite = 1

    else:
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif sys.argv[1] == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif sys.argv[1] == 'obsol':
            # get all obsolete entries
            pl.download_obsolete_entries(pdb_path)

        elif len(sys.argv[1]) == 4 and sys.argv[1][0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(sys.argv[1], pdir=pdb_path)