File: build_sascore_db.py

Package: rdkit 202503.6-2
#!/usr/bin/python3
#
# File to build a database of fragment scores for the Synthetic
# Accessibility score of Ertl and Schuffenhauer
# Journal of Cheminformatics 1:8 (2009)
# http://www.jcheminf.com/content/1/1/8
#
# The script exists so that the fragment scores can be updated to reflect
# advances in chemistry.  For example, building with a recent SureChEMBL
# dataset substantially reduced all the scores for the original training
# compounds.
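#
# A typical invocation might look like this (the file names are placeholders
# for illustration; the available flags are defined in parse_args below):
#
#   python3 build_sascore_db.py -I molecules.sdf.gz -S fpscores.pkl.gz \
#       --output-freqs-pickle raw_freqs.pkl -P freq_dist.svg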

import argparse
import gzip
import math
import pickle
import sys

try:
  import seaborn.objects as so
  HAVE_SEABORN = True
except ModuleNotFoundError:
  HAVE_SEABORN = False

from collections import defaultdict
from pathlib import Path

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator


def parse_args(cli_args):
  """
        Use argparse module to parse the command line arguments.

        Returns:
            (Namespace object): Parsed arguments in argparse Namespace
                                object.
        """
  parser = argparse.ArgumentParser(description='Generate fragment scores'
                                   ' data for Synthetic'
                                   ' Accessibility (SA) Score.')
  parser.add_argument('-S', '--score-file', dest='score_file', required=True,
                      help='Name of the output scores file.')
  in_file_args = parser.add_mutually_exclusive_group(required=True)
  in_file_args.add_argument(
    '-I', '--input-file', dest='input_file', help='Name of molecule file containing input'
    ' molecules to be processed.')
  in_file_args.add_argument('--input-freqs-pickle', dest='in_freq_pickle',
                            help='Name of previously generated frequencies'
                            ' pickle file.')
  parser.add_argument(
    '-P', '--plot-file', dest='plot_file', default='dist.svg',
    help='Name of file for plot of distribution of'
    ' frequencies. Must use an extension recognised by'
    ' matplotlib.  Default=%(default)s.')
  parser.add_argument(
    '--output-freqs-pickle', dest='out_freq_pickle',
    help='If this is given, the raw frequencies from the'
    ' molecules will be pickled to this file.')

  args = parser.parse_args(cli_args)
  return args


def create_mol_supplier(infile):
  """

    Args:
        infile (str): must end .smi, .sdf or .sdf.gz

    Returns:
        ForwardSDMolSupplier or None
    """
  inpath = Path(infile)
  sfx = inpath.suffix
  gzipped = False
  if sfx == '.gz':
    suffixes = inpath.suffixes
    gzipped = True
    sfx = suffixes[-2]

  if sfx not in ('.smi', '.sdf', '.mol'):
    print(f'ERROR : input must be a SMILES, SDF or MOL file, or a gzipped SDF'
          f' or MOL file, you gave {infile}.  Aborting.')
    return None

  if sfx == '.smi':
    if gzipped:
      # Chem.SmilesMolSupplier can't read gzipped files directly.
      print(f'ERROR : gzipped SMILES files are not supported, you gave'
            f' {infile}.  Aborting.')
      return None
    return Chem.SmilesMolSupplier(infile, titleLine=False, sanitize=False)
  else:
    try:
      if gzipped:
        inf = gzip.open(infile)
        return Chem.ForwardSDMolSupplier(inf, sanitize=False)
      else:
        return Chem.ForwardSDMolSupplier(infile, sanitize=False)
    except OSError:
      # FileNotFoundError is a subclass of OSError, so this also covers
      # missing files.
      print(f'ERROR : failed to open file {infile}.')
      return None


def extractFragments(molfile):
  """
    Read the input file and return a list of frequency counts of
    radius 2 Morgan fingerprints.  Sorted in descending order of
    frequency.  First element in each tuple is the fragment number,
    second is the count.
    """
  suppl = create_mol_supplier(molfile)
  if suppl is None:
    return None

  freqDict = defaultdict(int)
  mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2)

  for i, mol in enumerate(suppl):
    if i and not i % 1000:
      print(f'Done {i} molecules.', flush=True)
    if mol is None or not mol:
      continue
    # print(f'Next mol : {Chem.MolToSmiles(mol)}  {mol.GetProp("_Name")}', flush=True)

    sfp = mfpgen.GetSparseCountFingerprint(mol)
    for bitId in sfp.GetNonzeroElements():
      freqDict[bitId] += sfp[bitId]

  freqs = list(freqDict.items())
  freqs.sort(key=lambda f: f[1], reverse=True)

  return freqs


def plot_freqs(freqs, output_file):
  """
    Do a log frequency plot of the data.  Assume it is already sorted.
    """
  y_freqs = [f[1] for f in freqs]
  (so.Plot(x=range(len(y_freqs)),
           y=y_freqs).add(so.Line()).scale(y='log').label(x='fragment number',
                                                          y='log frequency').save(output_file))


def score_fragments(freqs):
  """
    Calculate the score for each fragment.  This is the "logarithm of
    the ratio between the actual fragment count and the number of
    fragments forming 80% of all fragments in the database"
    Which is a bit ambiguous to my reading.  Reverse-engineering the
    RDKit pickle file of scores shows that it's a log10, not natural
    log, and the denominator is 1338.
    The denominator thus seems to be summing the total frequencies
    and then finding the number of fragments needed to give a sum of
    frequencies that's 80% of that, summing in descending order of
    frequency.  For the 22M compounds of SureChEMBL as of April 2023,
    that gives a denominator of 1336.
    Returns a dict of scores, where the key is a score, and the items
    are lists of fragments with that score.  This is the original
    format used for the pickle file in the original RDKit contrib
    version of this score.  The score is taken to 4 decimal places
    and stringified.
    freqs contains tuples of fragment id and frequency, and is assumed
    already to be sorted in descending frequency.
    """
  tot_freqs = sum(f[1] for f in freqs)

  tot_freqs_80 = (tot_freqs * 8) // 10
  tf = 0
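  # freq_80_num ends up as the index of the last fragment whose cumulative
  # count is still below the 80% threshold; it is the denominator used in
  # the score calculation below.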
  for i, freq in enumerate(freqs):
    tf += freq[1]
    if tf < tot_freqs_80:
      freq_80_num = i
  print(f'tot_freqs : {tot_freqs} : tot_freqs_80 : {tot_freqs_80}'
        f'  freq_80_num = {freq_80_num}')

  scores_dict = defaultdict(list)
  for freq in freqs:
    score = f'{math.log10(freq[1] / freq_80_num):.4f}'
    scores_dict[score].append(freq[0])

  return scores_dict


def save_scores_dict(scores_dict, score_file):
  """
    The pickle appears to be list of lists, each one headed by a score
    followed by the fragments with that score.
    """
  out_lists = []
  for score, frags in scores_dict.items():
    out_lists.append([score] + frags)
  # for ol in out_lists:
  #     print(f'{len(ol)} : {ol[:10]}')
  with gzip.open(score_file, 'wb') as outf:
    pickle.dump(out_lists, outf)


def main(cli_args):
  args = parse_args(cli_args)
  if args.input_file is not None:
    freqs = extractFragments(args.input_file)
  else:
    print(f'reading frequencies from {args.in_freq_pickle}')
    with open(args.in_freq_pickle, 'rb') as f:
      freqs = pickle.load(f)

  if freqs is None:
    return False

  print(freqs[:10])
  print(freqs[-10:])
  num_1000 = 0
  num_1 = 0
  for f in freqs:
    if f[1] > 1000:
      num_1000 += 1
    if f[1] == 1:
      num_1 += 1
  print(f'num above 1000 : {num_1000}  num 1 : {num_1}')

  print(f'number of fragments : {len(freqs)}')
  if HAVE_SEABORN:
    plot_freqs(freqs, args.plot_file)
  else:
    print(f'Not writing plot file {args.plot_file} because seaborn was not found.')
  if args.out_freq_pickle is not None:
    with open(args.out_freq_pickle, 'wb') as f:
      pickle.dump(freqs, f)

  scores_dict = score_fragments(freqs)
  save_scores_dict(scores_dict, args.score_file)

  return True


if __name__ == '__main__':
  sys.exit(not main(sys.argv[1:]))


#
#  Copyright (c) 2023, Glysade LLC
#  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of Novartis Institutes for BioMedical Research Inc.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#