File: pubchem.py

package info (click to toggle)
python-ase 3.24.0-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 15,448 kB
  • sloc: python: 144,945; xml: 2,728; makefile: 113; javascript: 47
file content (315 lines) | stat: -rw-r--r-- 12,305 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
import json
import urllib.request
import warnings
from collections import namedtuple
from io import BytesIO, StringIO
from urllib.error import HTTPError, URLError

from ase.io import read

base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

PubchemSearch = namedtuple('PubchemSearch', 'search field')


class PubchemData:
    """
    a specialized class for entries from the pubchem database
    """

    def __init__(self, atoms, data):
        self.atoms = atoms
        self.data = data

    def get_atoms(self):
        return self.atoms

    def get_pubchem_data(self):
        return self.data


def search_pubchem_raw(search, field, silent=False, mock_test=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending of the
            `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'CID', and 'smiles'.'name' will search common '
            'names,CID will search the Pubchem Chemical Idenitification '
            'Numberswhich can be found on their website and smiles'
            ' searches for compounds with the entered smiles string.

    returns:
        data (str):
            a string containing the raw response from pubchem.
    """
    if mock_test:  # for testing only
        r = BytesIO(test_output)
    else:
        suffix = 'sdf?record_type=3d'

        url = (
            f'{base_url}/{field}/{search!s}/{suffix}'
            if field == 'conformers'
            else f'{base_url}/compound/{field}/{search!s}/{suffix}'
        )
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            raise ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            ) from e
        except URLError as e:
            raise ValueError(
                'Couldn\'t reach the pubchem servers, check'
                ' your internet connection'
            ) from e

    # check if there are confomers and warn them if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field,
                                                   mock_test=mock_test)
        if len(conformer_ids) > 1:
            warnings.warn(
                f'The structure "{search}" has more than one conformer in '
                'PubChem. By default, the first conformer is returned, please '
                'ensure you are using the structure you intend to or use the '
                '`ase.data.pubchem.pubchem_conformer_search` function'
            )

    return r.read().decode('utf-8')


def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms obejct containing the information from
            pubchem
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem

    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise Exception('There was a problem with the data returned by '
                        'PubChem')
    f_like = StringIO(data)
    atoms = read(f_like, format='sdf')

    # check if there are confomers and warn them if there are

    # further analyze the text returned from pubchem
    pubchem_data = {}
    other_info = data.split('END\n')[1]
    other_info = other_info.split('$')[0]  # remove the $$$$ at the end
    # the strucuture of this string is > <field>\nentry_info\n
    other_info = other_info.split('> <')  # split into the fields
    for data_field in other_info:
        if data_field == '':
            continue
        field_name, entry_value = data_field.split('>\n')
        # split it into lines and remove the empty lines
        entry_value = entry_value.splitlines()
        entry_value = [a for a in entry_value if a != '']
        if len(entry_value) == 1:
            entry_value = entry_value[0]
        pubchem_data[field_name] = entry_value
    # recover partial charges
    if 'PUBCHEM_MMFF94_PARTIAL_CHARGES' in pubchem_data:
        # the first entry just contains the number of atoms with charges
        charges = pubchem_data['PUBCHEM_MMFF94_PARTIAL_CHARGES'][1:]
        # each subsequent entry contains the index and charge of the atoms
        atom_charges = [0.] * len(atoms)
        for charge in charges:
            i, charge = charge.split()
            # indices start at 1
            atom_charges[int(i) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data


def analyze_input(name=None, cid=None, smiles=None, conformer=None,
                  silent=False):
    """
    helper function to translate keyword arguments from intialization
    and searching into the search and field that is being asked for

    Parameters:
        see `ase.data.pubchem.pubchem_search`
    returns:
        search:
            the search term the user has entered
        field:
            the name of the field being asked for

    """
    inputs = [name, cid, smiles, conformer]
    inputs_check = [a is not None for a in [name, cid, smiles, conformer]]
    input_fields = ['name', 'cid', 'smiles', 'conformers']

    if inputs_check.count(True) > 1:
        raise ValueError('Only one search term my be entered a time.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')
    elif inputs_check.count(True) == 1:
        # Figure out which input has been passed in
        index = inputs_check.index(True)
        field = input_fields[index]
        search = inputs[index]
    else:
        raise ValueError('No search was entered.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')

    return PubchemSearch(search, field)


def available_conformer_search(search, field, mock_test=False):
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the confomer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending of the
            `field` you are searching

        field (str):
            the particular field you are searching with. Possible values
            are 'name', 'CID', and 'smiles'.'name' will search common '
            'names,CID will search the Pubchem Chemical Idenitification '
            'Numberswhich can be found on their website and smiles'
            ' searches for compounds with the entered smiles string.

        returns:
            conformers_ids (list):
                a list of the conformer IDs from PubChem, this is different
                than the CID numbers
    """
    suffix = 'conformers/JSON'
    url = f'{base_url}/compound/{field}/{search!s}/{suffix}'
    if mock_test:
        r = BytesIO(test_conformer_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            err = ValueError(
                f'the search term {search} could not be found for the field '
                f'{field}'
            )
            raise err from e
        except URLError as e:
            err = ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection')
            raise err from e
    record = r.read().decode('utf-8')
    record = json.loads(record)
    return record['InformationList']['Information'][0]['ConformerID']


def pubchem_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one argument may be passed
    in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """

    search, field = analyze_input(*args, **kwargs)
    raw_pubchem = search_pubchem_raw(search, field, mock_test=mock_test)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    return PubchemData(atoms, data)


def pubchem_conformer_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for all the conformers of a given compound.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the PubchemData objects of all the conformers
            for your search
    """

    search, field = analyze_input(*args, **kwargs)

    conformer_ids = available_conformer_search(search, field,
                                               mock_test=mock_test)
    return [
        pubchem_search(mock_test=mock_test, conformer=id_)
        for id_ in conformer_ids
    ]


def pubchem_atoms_search(*args, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning an atoms object.Note that only one argument may be passed
    in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    return pubchem_search(*args, **kwargs).get_atoms()


def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Search PubChem for all the conformers of a given compound.
    Note that only one argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    conformers = pubchem_conformer_search(*args, **kwargs)
    conformers = [conformer.get_atoms() for conformer in conformers]
    return conformers


test_output = b'222\n  -OEChem-10071914343D\n\n  4  3  0     0  0  0  0  0  0999 V2000\n    0.0000    0.0000    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0\n   -0.4417    0.2906    0.8711 H   0  0  0  0  0  0  0  0  0  0  0  0\n    0.7256    0.6896   -0.1907 H   0  0  0  0  0  0  0  0  0  0  0  0\n    0.4875   -0.8701    0.2089 H   0  0  0  0  0  0  0  0  0  0  0  0\n  1  2  1  0  0  0  0\n  1  3  1  0  0  0  0\n  1  4  1  0  0  0  0\nM  END\n> <PUBCHEM_COMPOUND_CID>\n222\n\n> <PUBCHEM_CONFORMER_RMSD>\n0.4\n\n> <PUBCHEM_CONFORMER_DIVERSEORDER>\n1\n\n> <PUBCHEM_MMFF94_PARTIAL_CHARGES>\n4\n1 -1.08\n2 0.36\n3 0.36\n4 0.36\n\n> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>\n0\n\n> <PUBCHEM_PHARMACOPHORE_FEATURES>\n1\n1 1 cation\n\n> <PUBCHEM_HEAVY_ATOM_COUNT>\n1\n\n> <PUBCHEM_ATOM_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ISOTOPIC_ATOM_COUNT>\n0\n\n> <PUBCHEM_COMPONENT_COUNT>\n1\n\n> <PUBCHEM_CACTVS_TAUTO_COUNT>\n1\n\n> <PUBCHEM_CONFORMER_ID>\n000000DE00000001\n\n> <PUBCHEM_MMFF94_ENERGY>\n0\n\n> <PUBCHEM_FEATURE_SELFOVERLAP>\n5.074\n\n> <PUBCHEM_SHAPE_FINGERPRINT>\n260 1 18410856563934756871\n\n> <PUBCHEM_SHAPE_MULTIPOLES>\n15.6\n0.51\n0.51\n0.51\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n\n> <PUBCHEM_SHAPE_SELFOVERLAP>\n14.89\n\n> <PUBCHEM_SHAPE_VOLUME>\n15.6\n\n> <PUBCHEM_COORDINATE_TYPE>\n2\n5\n10\n\n$$$$\n'  # noqa
test_conformer_output = b'{\n  "InformationList": {\n    "Information": [\n      {\n        "CID": 222,\n        "ConformerID": [\n          "000000DE00000001"\n        ]\n      }\n    ]\n  }\n}\n'  # noqa