1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
|
#
# Copyright (C) 2019 Greg Landrum
#
# @@ All Rights Reserved @@
# This file is part of the RDKit.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the RDKit source tree.
#
import time
import gzip
import rdkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger
import argparse
logger = logger()
parser = argparse.ArgumentParser(
description="benchmark and test fingerprint screenout and substructure searching")
parser.add_argument("--validate", dest='validateResults', default=False, action='store_true',
help="validate that the screenout isn't missing anything")
parser.add_argument("--short", dest='doShort', default=False, action='store_true',
help="run a small subset of the molecules")
args = parser.parse_args()
ts = []
logger.info('mols from smiles')
mols = []
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/chembl21_25K.pairs.txt.gz
with gzip.open('../Data/chembl21_25K.pairs.txt.gz', 'rb') as inf:
for line in inf:
line = line.decode().strip().split()
smi1 = line[1]
smi2 = line[3]
mols.append(Chem.MolFromSmiles(smi1))
mols.append(Chem.MolFromSmiles(smi2))
if args.doShort and len(mols) >= 1000:
break
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds, {len(mols)} mols')
logger.info('queries from smiles')
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.frags.500.q.smi
frags = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.frags.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.leads.500.q.smi
leads = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.leads.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/fragqueries.q.txt
pieces = [Chem.MolFromSmiles(x) for x in open('../Data/fragqueries.q.txt', 'r')]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')
logger.info('generating pattern fingerprints for mols')
t1 = time.time()
mfps = [Chem.PatternFingerprint(m) for m in mols]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')
logger.info('generating pattern fingerprints for queries')
t1 = time.time()
fragsfps = [Chem.PatternFingerprint(m, 2048) for m in frags]
leadsfps = [Chem.PatternFingerprint(m, 2048) for m in leads]
piecesfps = [Chem.PatternFingerprint(m, 2048) for m in pieces]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')
for nm, qs, qfps in [('frags', frags, fragsfps), ('leads', leads, leadsfps), ('pieces', pieces, piecesfps)]:
logger.info(f'testing {nm} queries')
t1 = time.time()
nPossible = 0
nTested = 0
nFound = 0
nErrors = 0
for i, fragfp in enumerate(qfps):
for j, mfp in enumerate(mfps):
nPossible += 1
if args.validateResults:
matched = mols[j].HasSubstructMatch(qs[i])
fpMatch = DataStructs.AllProbeBitsMatch(fragfp, mfp)
if fpMatch:
nTested += 1
if matched:
nFound += 1
if not fpMatch:
nErrors += 1
logger.error(f"ERROR: mol {j} query {i}")
else:
if DataStructs.AllProbeBitsMatch(fragfp, mfp):
nTested += 1
if mols[j].HasSubstructMatch(qs[i]):
nFound += 1
t2 = time.time()
ts.append(t2 - t1)
logger.info(
f'Results{len(ts)}: {t2-t1 : .2f} seconds. {nTested} tested ({nTested/nPossible :.4f} of total), {nFound} found, {nFound/nTested : .2f} accuracy. {nErrors} errors.')
print(f"| {rdkit.__version__} | {' | '.join(['%.1f' % x for x in ts])} |")
|