File: fingerprint_screenout.py

package info (click to toggle)
rdkit 202009.4-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 129,624 kB
  • sloc: cpp: 288,030; python: 75,571; java: 6,999; ansic: 5,481; sql: 1,968; yacc: 1,842; lex: 1,254; makefile: 572; javascript: 461; xml: 229; fortran: 183; sh: 134; cs: 93
file content (110 lines) | stat: -rw-r--r-- 4,158 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#
#  Copyright (C) 2019 Greg Landrum
#
#   @@ All Rights Reserved @@
#  This file is part of the RDKit.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the RDKit source tree.
#
import time
import gzip
import rdkit
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger
import argparse

logger = logger()

parser = argparse.ArgumentParser(
    description="benchmark and test fingerprint screenout and substructure searching")
parser.add_argument("--validate", dest='validateResults', default=False, action='store_true',
                    help="validate that the screenout isn't missing anything")
parser.add_argument("--short", dest='doShort', default=False, action='store_true',
                    help="run a small subset of the molecules")
args = parser.parse_args()

ts = []

logger.info('mols from smiles')
mols = []
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/chembl21_25K.pairs.txt.gz
with gzip.open('../Data/chembl21_25K.pairs.txt.gz', 'rb') as inf:
    for line in inf:
        line = line.decode().strip().split()
        smi1 = line[1]
        smi2 = line[3]
        mols.append(Chem.MolFromSmiles(smi1))
        mols.append(Chem.MolFromSmiles(smi2))
        if args.doShort and len(mols) >= 1000:
            break
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds, {len(mols)} mols')


logger.info('queries from smiles')
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.frags.500.q.smi
frags = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.frags.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.leads.500.q.smi
leads = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.leads.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/fragqueries.q.txt
pieces = [Chem.MolFromSmiles(x) for x in open('../Data/fragqueries.q.txt', 'r')]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')


logger.info('generating pattern fingerprints for mols')
t1 = time.time()
mfps = [Chem.PatternFingerprint(m) for m in mols]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')


logger.info('generating pattern fingerprints for queries')
t1 = time.time()
fragsfps = [Chem.PatternFingerprint(m, 2048) for m in frags]
leadsfps = [Chem.PatternFingerprint(m, 2048) for m in leads]
piecesfps = [Chem.PatternFingerprint(m, 2048) for m in pieces]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')

for nm, qs, qfps in [('frags', frags, fragsfps), ('leads', leads, leadsfps), ('pieces', pieces, piecesfps)]:
    logger.info(f'testing {nm} queries')
    t1 = time.time()
    nPossible = 0
    nTested = 0
    nFound = 0
    nErrors = 0
    for i, fragfp in enumerate(qfps):
        for j, mfp in enumerate(mfps):
            nPossible += 1
            if args.validateResults:
                matched = mols[j].HasSubstructMatch(qs[i])
                fpMatch = DataStructs.AllProbeBitsMatch(fragfp, mfp)
                if fpMatch:
                    nTested += 1
                if matched:
                    nFound += 1
                    if not fpMatch:
                        nErrors += 1
                        logger.error(f"ERROR: mol {j} query {i}")
            else:
                if DataStructs.AllProbeBitsMatch(fragfp, mfp):
                    nTested += 1
                    if mols[j].HasSubstructMatch(qs[i]):
                        nFound += 1
    t2 = time.time()
    ts.append(t2 - t1)
    logger.info(
        f'Results{len(ts)}: {t2-t1 : .2f} seconds. {nTested} tested ({nTested/nPossible :.4f} of total), {nFound} found, {nFound/nTested : .2f} accuracy. {nErrors} errors.')


print(f"| {rdkit.__version__} | {' | '.join(['%.1f' % x for x in ts])} |")