File: pdb_fetch.py

package info (click to toggle)
pdb-tools 2.6.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,644 kB
  • sloc: python: 9,242; makefile: 13
file content (177 lines) | stat: -rw-r--r-- 4,800 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2018 João Pedro Rodrigues
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Downloads a structure in PDB format from the RCSB website.

Allows downloading the (first) biological structure if selected.

Usage:
    python pdb_fetch.py [-biounit] <pdb code>

Example:
    python pdb_fetch.py 1brs  # downloads unit cell, all 6 chains
    python pdb_fetch.py -biounit 1brs  # downloads biounit, 2 chains

This program is part of the `pdb-tools` suite of utilities and should not be
distributed isolatedly. The `pdb-tools` were created to quickly manipulate PDB
files using the terminal, and can be used sequentially, with one tool streaming
data to another. They are based on old FORTRAN77 code that was taking too much
effort to maintain and compile. RIP.
"""

import gzip
import re
import sys

# Python 3 vs Python 2
if sys.version_info[0] < 3:
    from cStringIO import StringIO as IO
    from urllib2 import Request, build_opener
    from urllib2 import HTTPError
else:
    from io import BytesIO as IO
    from urllib.request import Request, build_opener
    from urllib.error import HTTPError

# Module metadata: author name and contact address.
__author__ = "Joao Rodrigues"
__email__ = "j.p.g.l.m.rodrigues@gmail.com"


def check_input(args):
    """Validate the command-line arguments and extract the PDB code/option.

    Parameters
    ----------
    args : list of str
        Command-line arguments without the program name. Accepted forms are
        ``[<pdb code>]`` and ``['-biounit', <pdb code>]``.

    Returns
    -------
    tuple
        ``(pdb_code, option)``, where ``option`` is True when ``-biounit``
        was requested and False otherwise.

    Notes
    -----
    On invalid input, an error message (when one applies) followed by the
    module usage text is written to stderr and the program exits with
    status 1 via ``sys.exit``.
    """

    def _fail(template=None, value=None):
        # Report the offending value (if any), print the usage text, exit.
        if template is not None:
            sys.stderr.write(template.format(value))
        # __doc__ is None when docstrings are stripped (python -OO);
        # writing None to stderr would raise TypeError.
        sys.stderr.write(__doc__ or '')
        sys.exit(1)

    code_regex = r'[0-9a-zA-Z]{4}$'
    bad_code_msg = 'ERROR!! Invalid PDB code: \'{}\'\n'
    bad_option_msg = 'ERROR!! Invalid option: \'{}\'\n'

    # Defaults
    option = False

    if len(args) == 1:
        # pdb code only
        pdb_code = args[0]
        if not re.match(code_regex, pdb_code):
            _fail(bad_code_msg, pdb_code)

    elif len(args) == 2:
        # biounit & pdb code
        if not re.match(r'\-biounit$', args[0]):
            _fail(bad_option_msg, args[0])

        pdb_code = args[1]
        if not re.match(code_regex, pdb_code):
            # Report the PDB code itself, not the option that preceded it.
            _fail(bad_code_msg, pdb_code)

        option = True

    else:
        # Wrong number of arguments: show usage only.
        _fail()

    return (pdb_code, option)


def run(pdbid, biounit=False):
    """
    Download the structure in PDB format from the RCSB PDB website.

    This function is a generator.

    Parameters
    ----------
    pdbid : str
        The alpha-numeric code of the PDB entry. It is lower-cased before
        being used to build the download URL.

    biounit : bool
        Whether to download biounit version (``.pdb1`` file) instead of
        the regular ``.pdb`` file.

    Yield
    -----
    str (line-by-line)
        The original PDB data, decompressed and decoded as UTF-8.

    Notes
    -----
    HTTP errors, and I/O errors raised while decompressing the payload, are
    reported on stderr and end the generator instead of propagating.
    """

    base_url = 'https://files.rcsb.org/download/'
    pdb_type = '.pdb1' if biounit else '.pdb'
    pdb_url = base_url + pdbid.lower() + pdb_type + '.gz'

    try:
        request = Request(pdb_url)
        opener = build_opener()
        response = opener.open(request)
        try:
            url_data = response.read()
        finally:
            # Release the connection as soon as the payload is in memory.
            response.close()

    except HTTPError as e:
        emsg = '[!] Error fetching structure: ({0}) {1}\n'
        sys.stderr.write(emsg.format(e.code, e.msg))
        return

    # Bound up-front so the cleanup below is safe even if GzipFile() fails.
    gz_handle = None
    try:
        buf = IO(url_data)
        gz_handle = gzip.GzipFile(fileobj=buf, mode='rb')
        for line in gz_handle:
            yield line.decode('utf-8')

    except IOError as e:
        # A generic IOError (e.g. a corrupt gzip stream) has no .code/.msg
        # attributes, so report the exception text itself.
        emsg = '[!] Error reading structure: {0}\n'
        sys.stderr.write(emsg.format(e))
        return

    finally:
        if gz_handle is not None:
            gz_handle.close()


# Alias exposing the generator under a more descriptive name.
fetch_structure = run


def main():
    """Command-line entry point: fetch a structure and stream it to stdout.

    Arguments are taken from ``sys.argv`` and validated by ``check_input``.
    The lines produced by ``run`` are written to stdout in batches, and the
    process always finishes through ``sys.exit(0)``.
    """
    # Validate the arguments
    code, want_biounit = check_input(sys.argv[1:])

    # Lazily produce the structure, line by line
    structure_lines = run(code, want_biounit)

    chunk_size = 5000  # number of lines gathered before each write
    try:
        pending = []
        for index, text in enumerate(structure_lines):
            if index % chunk_size == 0:
                # Flush what has been gathered so far and start a new batch.
                sys.stdout.write(''.join(pending))
                pending = []
            pending.append(text)

        # Write whatever is left over from the last (partial) batch.
        sys.stdout.write(''.join(pending))
        sys.stdout.flush()
    except IOError:
        # Broken pipes (e.g. output piped into 'head' or 'tail') are
        # deliberately silenced so no error message shows up.
        pass

    sys.exit(0)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()