File: fingerprint_screenout.py

package info (click to toggle)
rdkit 202503.6-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 222,000 kB
  • sloc: cpp: 411,111; python: 78,482; ansic: 26,181; java: 8,285; javascript: 4,404; sql: 2,393; yacc: 1,626; lex: 1,267; cs: 1,090; makefile: 581; xml: 229; fortran: 183; sh: 121
file content (108 lines) | stat: -rw-r--r-- 3,956 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#
#  Copyright (C) 2019 Greg Landrum
#
#   @@ All Rights Reserved @@
#  This file is part of the RDKit.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the RDKit source tree.
#
import argparse
import gzip
import time

import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.RDLogger import logger

logger = logger()

parser = argparse.ArgumentParser(
  description="benchmark and test fingerprint screenout and substructure searching")
parser.add_argument("--validate", dest='validateResults', default=False, action='store_true',
                    help="validate that the screenout isn't missing anything")
parser.add_argument("--short", dest='doShort', default=False, action='store_true',
                    help="run a small subset of the molecules")
args = parser.parse_args()

ts = []

logger.info('mols from smiles')
mols = []
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/chembl21_25K.pairs.txt.gz
with gzip.open('../Data/chembl21_25K.pairs.txt.gz', 'rb') as inf:
  for line in inf:
    line = line.decode().strip().split()
    smi1 = line[1]
    smi2 = line[3]
    mols.append(Chem.MolFromSmiles(smi1))
    mols.append(Chem.MolFromSmiles(smi2))
    if args.doShort and len(mols) >= 1000:
      break
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds, {len(mols)} mols')

logger.info('queries from smiles')
t1 = time.time()
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.frags.500.q.smi
frags = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.frags.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/zinc.leads.500.q.smi
leads = [Chem.MolFromSmiles(x.split()[0]) for x in open('../Data/zinc.leads.500.q.smi', 'r')]
# find this file here: https://raw.githubusercontent.com/greglandrum/rdkit_blog/master/data/fragqueries.q.txt
pieces = [Chem.MolFromSmiles(x) for x in open('../Data/fragqueries.q.txt', 'r')]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')

logger.info('generating pattern fingerprints for mols')
t1 = time.time()
mfps = [Chem.PatternFingerprint(m) for m in mols]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')

logger.info('generating pattern fingerprints for queries')
t1 = time.time()
fragsfps = [Chem.PatternFingerprint(m, 2048) for m in frags]
leadsfps = [Chem.PatternFingerprint(m, 2048) for m in leads]
piecesfps = [Chem.PatternFingerprint(m, 2048) for m in pieces]
t2 = time.time()
ts.append(t2 - t1)
logger.info(f'Results{len(ts)}: {t2-t1 : .2f} seconds')

for nm, qs, qfps in [('frags', frags, fragsfps), ('leads', leads, leadsfps),
                     ('pieces', pieces, piecesfps)]:
  logger.info(f'testing {nm} queries')
  t1 = time.time()
  nPossible = 0
  nTested = 0
  nFound = 0
  nErrors = 0
  for i, fragfp in enumerate(qfps):
    for j, mfp in enumerate(mfps):
      nPossible += 1
      if args.validateResults:
        matched = mols[j].HasSubstructMatch(qs[i])
        fpMatch = DataStructs.AllProbeBitsMatch(fragfp, mfp)
        if fpMatch:
          nTested += 1
        if matched:
          nFound += 1
          if not fpMatch:
            nErrors += 1
            logger.error(f"ERROR: mol {j} query {i}")
      else:
        if DataStructs.AllProbeBitsMatch(fragfp, mfp):
          nTested += 1
          if mols[j].HasSubstructMatch(qs[i]):
            nFound += 1
  t2 = time.time()
  ts.append(t2 - t1)
  logger.info(
    f'Results{len(ts)}: {t2-t1 : .2f} seconds. {nTested} tested ({nTested/nPossible :.4f} of total), {nFound} found, {nFound/nTested : .2f} accuracy. {nErrors} errors.'
  )

print(f"| {rdkit.__version__} | {' | '.join(['%.1f' % x for x in ts])} |")