1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
import argparse
import operator
import sys
from collections import Counter, defaultdict, namedtuple
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import FilterCatalog, RDConfig, rdMolDescriptors
# Result record built for every input compound; the fields hold the matched
# substructure filters, the N/O presence flag, the N/O atom fraction, the
# covalent and special-molecule counts, and the overall severity score.
FilterMatch = namedtuple('FilterMatch', [
    'SubstructureMatches',
    'Min_N_O_filter',
    'Frac_N_O',
    'Covalent',
    'SpecialMol',
    'SeverityScore',
])
# Build the filter catalog using the RDKit filterCatalog module
def buildFilterCatalog():
    """Turn the NIBR substructure filter table into an RDKit FilterCatalog.

    The table is read from the NIBRSubstructureFilters folder below
    ``RDConfig.RDContribDir``. Each row yields one catalog entry whose
    description is ``<pattern>_min(<count>)__<severity>__<covalent>__<special>``
    and whose ``Scope`` property carries the row's SET_NAME.

    Returns:
        The populated ``FilterCatalog.FilterCatalog``.
    """
    table = pd.read_csv(
        f'{RDConfig.RDContribDir}/NIBRSubstructureFilters/SubstructureFilter_HitTriaging_wPubChemExamples.csv'
    )
    columns = ('MIN_COUNT', 'PATTERN_NAME', 'SET_NAME', 'SEVERITY_SCORE', 'COVALENT',
               'SPECIAL_MOL', 'SMARTS')
    catalog = FilterCatalog.FilterCatalog()
    for idx in range(len(table)):
        rawCount, pattern, scope, severity, covalent, special, smarts = (
            table[column][idx] for column in columns)
        # A MIN_COUNT of 0 means "no explicit minimum": one match is enough.
        required = 1 if rawCount == 0 else int(rawCount)
        description = '{0}_min({1})__{2}__{3}__{4}'.format(pattern, required, severity, covalent,
                                                           special)
        matcher = FilterCatalog.SmartsMatcher(description, smarts, required)
        catalog.AddEntry(FilterCatalog.FilterCatalogEntry(description, matcher))
        # Entries are appended in row order, so the new entry sits at position idx.
        catalog.GetEntry(idx).SetProp('Scope', scope)
    return catalog
# Assign substructure filters and fraction of Nitrogen and Oxygen atoms
def assignFilters(data, nameSmilesColumn='smiles'):
    """Run every SMILES of ``data`` through the substructure filter catalog.

    Args:
        data: object that yields the SMILES strings when indexed with
            ``nameSmilesColumn`` and iterated (the script entry point passes
            a pandas DataFrame).
        nameSmilesColumn: name of the column holding the SMILES strings.

    Returns:
        A list with one ``FilterMatch`` per SMILES, in input order. When a
        compound cannot be processed, a failure message is printed and the
        fields that had not been computed yet stay NaN.
    """
    results = []
    inhouseFiltersCat = buildFilterCatalog()
    # Compiled once and reused for both the N/O atom count and the N/O
    # presence check instead of re-parsing the SMARTS for every molecule.
    nitrogenOxygen = Chem.MolFromSmarts('[#7,#8]', mergeHs=True)

    for smi in data[nameSmilesColumn]:
        # NaN marks "not computed"; each value is overwritten on success.
        qc, noFlag, fracNO, co, sc, sm = [np.nan] * 6
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # MolFromSmiles signals an unparsable SMILES by returning None.
                raise ValueError('SMILES could not be parsed')

            # fraction of N and O atoms
            numHeavyAtoms = mol.GetNumHeavyAtoms()
            numNO = len(mol.GetSubstructMatches(nitrogenOxygen))
            fracNO = float(numNO) / numHeavyAtoms

            # all substructure filters
            entries = list(inhouseFiltersCat.GetMatches(mol))
            if entries:
                fs, sev, cov, spm = [], [], [], []
                for entry in entries:
                    # description layout: name__severity__covalent__special
                    n, s, c, m = entry.GetDescription().split('__')
                    fs.append(entry.GetProp("Scope") + '_' + n)
                    sev.append(int(s))
                    cov.append(int(c))
                    spm.append(int(m))
                # concatenate all matching filters
                qc = ' | '.join(fs)
                # a single severity-2 hit fixes the overall score at 10,
                # otherwise the individual severities are summed
                sc = 10 if 2 in sev else sum(sev)
                # number of covalent flags and special molecule flags
                co = sum(cov)
                sm = sum(spm)
            else:
                # none of the filters matches
                qc = 'no match'
                sc = co = sm = 0

            # special NO filter
            if mol.HasSubstructMatch(nitrogenOxygen):
                noFlag = 'no match'
            else:
                noFlag = 'no_oxygen_or_nitrogen'
        except Exception:
            # Best effort: report the compound and keep going with the NaN placeholders.
            print("Failed on compound {0}\n".format(smi))
        results.append(FilterMatch(qc, noFlag, fracNO, co, sm, sc))
    return results
def parseBoolFlag(value):
    """Convert a command-line token into a bool for argparse.

    ``type=bool`` would treat every non-empty string (including "False" and
    "0") as True, so the accepted spellings are listed explicitly.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if token in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {0!r}'.format(value))


if __name__ == "__main__":
    # Command line: read a CSV, apply the filters to its SMILES column and
    # write the input table plus the filter results to a new CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, required=True,
                        help='Please specify the path to your data file. Required format: csv')
    parser.add_argument('--smilesColumn', type=str, required=True,
                        help='Please specify the name of your SMILES column.')
    parser.add_argument('--result', type=str, required=True,
                        help='Please specify the name of your result file.')
    # nargs='?' keeps "--verbose VALUE" working and also allows a bare "--verbose".
    parser.add_argument('--verbose', type=parseBoolFlag, nargs='?', const=True, default=True,
                        help='Generate output? Default: True')
    args = parser.parse_args()

    if args.verbose:
        print('---> Reading data')
    try:
        data = pd.read_csv(args.data)
    except Exception as err:
        # sys.exit(str) writes the message to stderr and exits with status 1.
        sys.exit('Data could not be read. Please check your file. ({0})'.format(err))

    smiCol = args.smilesColumn
    if smiCol not in data.columns:
        sys.exit('Smiles column does not exist. Please check.')

    if args.verbose:
        print('---> Apply filters to data')
    try:
        results = assignFilters(data, nameSmilesColumn=smiCol)
    except Exception as err:
        sys.exit('Filters could not be applied: {0}'.format(err))

    # One result row per input row, aligned on the positional index.
    df_tmp = pd.DataFrame.from_records(results, columns=FilterMatch._fields)
    data = data.merge(df_tmp, how='left', left_index=True, right_index=True)
    data.to_csv(args.result, index=False)
    if args.verbose:
        print('---> Done')
|