File: assignSubstructureFilters.py

package info (click to toggle)
rdkit 202503.1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 220,160 kB
  • sloc: cpp: 399,240; python: 77,453; ansic: 25,517; java: 8,173; javascript: 4,005; sql: 2,389; yacc: 1,565; lex: 1,263; cs: 1,081; makefile: 580; xml: 229; fortran: 183; sh: 105
file content (143 lines) | stat: -rw-r--r-- 4,755 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import argparse
import operator
import sys
from collections import Counter, defaultdict, namedtuple

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import FilterCatalog, RDConfig, rdMolDescriptors

# Per-molecule result record produced by assignFilters(); the field order is
# also the column order of the result table written by the script.
FilterMatch = namedtuple(
  'FilterMatch',
  [
    'SubstructureMatches',  # ' | '-joined names of matching filters, or 'no match'
    'Min_N_O_filter',  # 'no_oxygen_or_nitrogen' or 'no match'
    'Frac_N_O',  # number of N/O atom matches divided by the heavy-atom count
    'Covalent',  # sum of the covalent flags of all matching filters
    'SpecialMol',  # sum of the special-molecule flags of all matching filters
    'SeverityScore',  # overall severity assigned from the matching filters
  ],
)


# Build the filter catalog using the RDKit filterCatalog module
def buildFilterCatalog():
  """Load the NIBR substructure filters into an RDKit ``FilterCatalog``.

  The filter definitions are read from the CSV file shipped in the
  ``NIBRSubstructureFilters`` folder below ``RDConfig.RDContribDir``. Each row
  becomes one catalog entry backed by a ``SmartsMatcher`` built from the row's
  SMARTS and minimum match count (a ``MIN_COUNT`` of 0 is treated as 1).

  The entry description packs several columns into one string,
  ``<PATTERN_NAME>_min(<count>)__<SEVERITY_SCORE>__<COVALENT>__<SPECIAL_MOL>``,
  and the row's ``SET_NAME`` is stored on the entry as the ``Scope`` property.

  Returns:
    The populated ``FilterCatalog.FilterCatalog``.
  """
  csvPath = f'{RDConfig.RDContribDir}/NIBRSubstructureFilters/SubstructureFilter_HitTriaging_wPubChemExamples.csv'
  table = pd.read_csv(csvPath)

  catalog = FilterCatalog.FilterCatalog()
  for row in range(table.shape[0]):
    # A minimum count of 0 in the table means "match at least once".
    rawCount = table['MIN_COUNT'][row]
    minCount = int(rawCount) if rawCount != 0 else 1

    patternName = table['PATTERN_NAME'][row]
    setName = table['SET_NAME'][row]
    severity = table['SEVERITY_SCORE'][row]
    covalent = table['COVALENT'][row]
    specialMol = table['SPECIAL_MOL'][row]
    label = f'{patternName}_min({minCount})__{severity}__{covalent}__{specialMol}'

    matcher = FilterCatalog.SmartsMatcher(label, table['SMARTS'][row], minCount)
    catalog.AddEntry(FilterCatalog.FilterCatalogEntry(label, matcher))
    # Entries are appended in row order, so the row number addresses the entry
    # that was just added.
    catalog.GetEntry(row).SetProp('Scope', setName)
  return catalog


# Assign substructure filters and fraction of Nitrogen and Oxygen atoms
def assignFilters(data, nameSmilesColumn='smiles'):
  """Apply the substructure filters and the N/O checks to every SMILES in ``data``.

  Args:
    data: table-like object that is indexed with ``nameSmilesColumn`` to obtain
      an iterable of SMILES strings (the script passes a pandas DataFrame).
    nameSmilesColumn: name of the column holding the SMILES.

  Returns:
    A list with one ``FilterMatch`` per SMILES, in input order. For a compound
    that cannot be processed a message is printed and the fields that had not
    been computed yet are left as NaN.

  Raises:
    Whatever ``data[nameSmilesColumn]`` raises when the column is missing, and
    any error raised while building the filter catalog.
  """
  results = []

  inhouseFiltersCat = buildFilterCatalog()

  # Compiled once and reused for both the N/O atom count and the
  # "contains no N or O at all" check. The query contains no hydrogens, so
  # mergeHs does not change what it matches.
  nitrogenOxygenPattern = Chem.MolFromSmarts('[#7,#8]', mergeHs=True)

  for smi in data[nameSmilesColumn]:
    qc, noFlag, fracNO, co, sc, sm = [np.nan] * 6

    # The following files require numpy and were explicitly checked for their compatibility.

    try:
      mol = Chem.MolFromSmiles(smi)
      if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        raise ValueError('SMILES could not be parsed')

      # fraction of N and O atoms; a molecule without heavy atoms raises
      # ZeroDivisionError here and is reported as failed below
      numHeavyAtoms = mol.GetNumHeavyAtoms()
      numNO = len(mol.GetSubstructMatches(nitrogenOxygenPattern))
      fracNO = float(numNO) / numHeavyAtoms

      # all substructure filters
      entries = list(inhouseFiltersCat.GetMatches(mol))
      if entries:
        fs, sev, cov, spm = ([] for _ in range(4))
        for entry in entries:
          # The description is '<name>__<severity>__<covalent>__<special>'.
          # Split from the right so a pattern name containing '__' cannot
          # break the unpacking.
          n, s, c, m = entry.GetDescription().rsplit('__', 3)
          fs.append(entry.GetProp("Scope") + '_' + n)
          sev.append(int(s))
          cov.append(int(c))
          spm.append(int(m))
        # concatenate all matching filters
        qc = ' | '.join(fs)
        # a single filter of severity 2 forces the overall score to 10,
        # otherwise the individual severities are summed
        sc = 10 if 2 in sev else sum(sev)
        # number of covalent flags and special molecule flags
        co = sum(cov)
        sm = sum(spm)
      else:
        # none of the filters matches
        qc = 'no match'
        sc = 0
        co = 0
        sm = 0

      # special NO filter
      if mol.HasSubstructMatch(nitrogenOxygenPattern):
        noFlag = 'no match'
      else:
        noFlag = 'no_oxygen_or_nitrogen'
    except Exception:
      # Best effort: report the compound and keep going with the next one.
      print("Failed on compound {0}\n".format(smi))
    results.append(FilterMatch(qc, noFlag, fracNO, co, sm, sc))
  return results


def strToBool(value):
  """Convert a command-line token into a bool (for use as argparse ``type``).

  ``type=bool`` cannot be used for this: ``bool('False')`` is ``True`` because
  every non-empty string is truthy.

  Args:
    value: a bool, or a string such as 'true'/'false', 'yes'/'no', 'on'/'off',
      '1'/'0' (case-insensitive, surrounding whitespace ignored). The empty
      string counts as False.

  Returns:
    The corresponding bool.

  Raises:
    argparse.ArgumentTypeError: if the token is not a recognised boolean.
  """
  if isinstance(value, bool):
    return value
  token = str(value).strip().lower()
  if token in ('1', 'true', 't', 'yes', 'y', 'on'):
    return True
  if token in ('', '0', 'false', 'f', 'no', 'n', 'off'):
    return False
  raise argparse.ArgumentTypeError('Expected a boolean value, got {0!r}'.format(value))


def main(argv=None):
  """Command-line entry point: read a CSV, apply the filters, write the result.

  Args:
    argv: argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.

  Raises:
    SystemExit: with an error message (non-zero exit status) when the input
      file cannot be read, the SMILES column is missing, or filtering fails.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--data', type=str, required=True,
                      help='Please specify the path to your data file. Required format: csv')
  parser.add_argument('--smilesColumn', type=str, required=True,
                      help='Please specify the name of your SMILES column.')
  parser.add_argument('--result', type=str, required=True,
                      help='Please specify the name of your result file.')
  # A bare "--verbose" (no value) is accepted as True as well.
  parser.add_argument('--verbose', type=strToBool, nargs='?', const=True, default=True,
                      help='Print progress messages? Default: True')
  args = parser.parse_args(argv)

  if args.verbose:
    print('---> Reading data')
  try:
    data = pd.read_csv(args.data)
  except Exception as err:
    # sys.exit with a string prints it to stderr and exits with status 1, so
    # the failure is visible even when --verbose is off.
    sys.exit('Data could not be read. Please check your file. ({0})'.format(err))

  smiCol = args.smilesColumn
  if smiCol not in data.columns:
    sys.exit('Smiles column does not exist. Please check.')

  if args.verbose:
    print('---> Apply filters to data')
  try:
    results = assignFilters(data, nameSmilesColumn=smiCol)
  except Exception as err:
    sys.exit('Filters could not be applied. ({0})'.format(err))

  df_tmp = pd.DataFrame.from_records(results, columns=FilterMatch._fields)

  # Rows of df_tmp are in input order, so joining on the index puts each
  # result next to the row it was computed from.
  data = data.merge(df_tmp, how='left', left_index=True, right_index=True)
  data.to_csv(args.result, index=False)

  if args.verbose:
    print('---> Done')


if __name__ == "__main__":
  main()