File: rfrag.py

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (241 lines) | stat: -rw-r--r-- 8,125 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of GlaxoSmithKline Research & Development Ltd.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Jameed Hussain, July 2013
#
# Modifications and optimizations by Greg Landrum, July 2015
#
from __future__ import print_function
import sys
import re
from rdkit import Chem
from rdkit.Chem import rdMMPA


def find_correct(f_array):

  core = ""
  side_chains = ""

  for f in f_array:
    attachments = f.count("*")
    if (attachments == 1):
      side_chains = "%s.%s" % (side_chains, f)
    else:
      core = f

  side_chains = side_chains.lstrip('.')

  #cansmi the side chains
  temp = Chem.MolFromSmiles(side_chains)
  side_chains = Chem.MolToSmiles(temp, isomericSmiles=True)

  #and cansmi the core
  temp = Chem.MolFromSmiles(core)
  core = Chem.MolToSmiles(temp, isomericSmiles=True)

  return core, side_chains


def delete_bonds(smi, id, mol, bonds, out):

  #use the same parent mol object and create editable mol
  em = Chem.EditableMol(mol)

  #loop through the bonds to delete
  isotope = 0
  isotope_track = {}
  for i in bonds:
    isotope += 1
    #remove the bond
    em.RemoveBond(i[0], i[1])

    #now add attachement points
    newAtomA = em.AddAtom(Chem.Atom(0))
    em.AddBond(i[0], newAtomA, Chem.BondType.SINGLE)

    newAtomB = em.AddAtom(Chem.Atom(0))
    em.AddBond(i[1], newAtomB, Chem.BondType.SINGLE)

    #keep track of where to put isotopes
    isotope_track[newAtomA] = isotope
    isotope_track[newAtomB] = isotope

  #should be able to get away without sanitising mol
  #as the existing valencies/atoms not changed
  modifiedMol = em.GetMol()

  #canonical smiles can be different with and without the isotopes
  #hence to keep track of duplicates use fragmented_smi_noIsotopes
  fragmented_smi_noIsotopes = Chem.MolToSmiles(modifiedMol, isomericSmiles=True)

  valid = True
  fragments = fragmented_smi_noIsotopes.split(".")

  #check if its a valid triple cut
  if (isotope == 3):
    valid = False
    for f in fragments:
      matchObj = re.search('\*.*\*.*\*', f)
      if matchObj:
        valid = True
        break

  if valid:
    if (isotope == 1):
      fragmented_smi_noIsotopes = re.sub('\[\*\]', '[*:1]', fragmented_smi_noIsotopes)

      fragments = fragmented_smi_noIsotopes.split(".")

      #print fragmented_smi_noIsotopes
      s1 = Chem.MolFromSmiles(fragments[0])
      s2 = Chem.MolFromSmiles(fragments[1])

      #need to cansmi again as smiles can be different
      output = '%s,%s,,%s.%s' % (smi, id, Chem.MolToSmiles(s1, isomericSmiles=True),
                                 Chem.MolToSmiles(s2, isomericSmiles=True))
      if ((output in out) == False):
        out.add(output)

    elif (isotope >= 2):
      #add the isotope labels
      for key in isotope_track:
        #to add isotope lables
        modifiedMol.GetAtomWithIdx(key).SetIsotope(isotope_track[key])
      fragmented_smi = Chem.MolToSmiles(modifiedMol, isomericSmiles=True)

      #change the isotopes into labels - currently can't add SMARTS or labels to mol
      fragmented_smi = re.sub('\[1\*\]', '[*:1]', fragmented_smi)
      fragmented_smi = re.sub('\[2\*\]', '[*:2]', fragmented_smi)
      fragmented_smi = re.sub('\[3\*\]', '[*:3]', fragmented_smi)

      fragments = fragmented_smi.split(".")

      #identify core/side chains and cansmi them
      core, side_chains = find_correct(fragments)

      #now change the labels on sidechains and core
      #to get the new labels, cansmi the dot-disconnected side chains
      #the first fragment in the side chains has attachment label 1, 2nd: 2, 3rd: 3
      #then change the labels accordingly in the core

      #this is required by the indexing script, as the side-chains are "keys" in the index
      #this ensures the side-chains always have the same numbering

      isotope_track = {}
      side_chain_fragments = side_chains.split(".")

      for s in range(len(side_chain_fragments)):
        matchObj = re.search('\[\*\:([123])\]', side_chain_fragments[s])
        if matchObj:
          #add to isotope_track with key: old_isotope, value:
          isotope_track[matchObj.group(1)] = str(s + 1)

      #change the labels if required
      if (isotope_track['1'] != '1'):
        core = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]', core)
        side_chains = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]', side_chains)
      if (isotope_track['2'] != '2'):
        core = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]', core)
        side_chains = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]', side_chains)

      if (isotope == 3):
        if (isotope_track['3'] != '3'):
          core = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]', core)
          side_chains = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]', side_chains)

      #now remove the XX
      core = re.sub('XX', '', core)
      side_chains = re.sub('XX', '', side_chains)

      output = '%s,%s,%s,%s' % (smi, id, core, side_chains)
      if ((output in out) == False):
        out.add(output)


def fragment_mol(smi, cid):
  mol = Chem.MolFromSmiles(smi)

  #different cuts can give the same fragments
  #to use outlines to remove them
  outlines = set()

  if (mol == None):
    sys.stderr.write("Can't generate mol for: %s\n" % (smi))
  else:
    frags = rdMMPA.FragmentMol(mol, pattern="[#6+0;!$(*=,#[!#6])]!@!=!#[*]", resultsAsMols=False)
    for core, chains in frags:
      output = '%s,%s,%s,%s' % (smi, cid, core, chains)
      if (not (output in outlines)):
        outlines.add(output)
    if not outlines:
      # for molecules with no cuts, output the parent molecule itself
      outlines.add('%s,%s,,' % (smi,cid))

  return outlines


if __name__ == '__main__':

  if (len(sys.argv) >= 2):
    print("Program that fragments a user input set of smiles.")
    print(
      "The program enumerates every single,double and triple acyclic single bond cuts in a molecule.\n")
    print("USAGE: ./rfrag.py <file_of_smiles")
    print("Format of smiles file: SMILES ID (space separated)")
    print("Output: whole mol smiles,ID,core,context\n")
    sys.exit(1)

  #read the STDIN
  for line in sys.stdin:

    line = line.rstrip()

    line_fields = re.split('\s|,', line)
    smiles = line_fields[0]
    cmpd_id = line_fields[1]

    #returns a set containing the output
    o = fragment_mol(smiles, cmpd_id)
    for l in o:
      print(l)

"""
optimization work.

Original:
~/RDKit_git/Contrib/mmpa > time head -100 ../../Data/Zinc/zim.smi | python rfrag.py > zim.frags.o

real	0m9.752s
user	0m9.704s
sys	0m0.043s


"""