File: ClusterMols.py

package info (click to toggle)
rdkit 202009.4-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 129,624 kB
  • sloc: cpp: 288,030; python: 75,571; java: 6,999; ansic: 5,481; sql: 1,968; yacc: 1,842; lex: 1,254; makefile: 572; javascript: 461; xml: 229; fortran: 183; sh: 134; cs: 93
file content (190 lines) | stat: -rwxr-xr-x 5,671 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#
#  Copyright (c) 2003-2006 Rational Discovery LLC
#
#   @@ All Rights Reserved @@
#  This file is part of the RDKit.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the RDKit source tree.
#
""" utility functionality for clustering molecules using fingerprints
 includes a command line app for clustering


Sample Usage:
  python ClusterMols.py  -d data.gdb -t daylight_sig \
    --idName="CAS_TF" -o clust1.pkl \
    --actTable="dop_test" --actName="moa_quant"

"""


import numpy

from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols, MolSimilarity
from rdkit.ML.Cluster import Murtagh
import pickle

# Module-level aliases for the reporting callables provided by FingerprintMols;
# the functions below call these with status and error strings.
message = FingerprintMols.message
error = FingerprintMols.error


def GetDistanceMatrix(data, metric, isSimilarity=1):
  """ Builds the condensed pairwise distance matrix for a set of fingerprints

    Arguments:

      - data: a list of tuples with fingerprints in position 1
        (the rest of the elements of the tuple are not important)

      - metric: a callable taking two fingerprints and returning a number

      - isSimilarity: if true (the default) the metric is treated as a
        similarity and each value is stored as 1 - metric(fp1, fp2);
        otherwise the metric value is stored unchanged

    Returns the symmetric distance matrix as a 1D numpy array of floats with
    nPts*(nPts-1)/2 entries, filled column by column:
    (0,1), (0,2), (1,2), (0,3), ...
    (see ML.Cluster.Resemblance for layout documentation)

    Fingerprints of different sizes are compared after folding the larger one
    down by the integer ratio of the two sizes.
    NOTE(review): this presumably expects the larger size to be an exact
    multiple of the smaller one -- confirm against DataStructs.FoldFingerprint.

  """
  nPts = len(data)
  # the builtin float is used here because the numpy.float alias has been
  # removed from recent numpy releases
  res = numpy.zeros((nPts * (nPts - 1) // 2), float)
  nSoFar = 0
  for col in range(1, nPts):
    colFp = data[col][1]
    for row in range(col):
      # start from the unfolded fingerprints for every pair
      fp1 = colFp
      fp2 = data[row][1]
      nBits1 = fp1.GetNumBits()
      nBits2 = fp2.GetNumBits()
      # the fold factor has to be an integer, so use floor division
      if nBits1 > nBits2:
        fp1 = DataStructs.FoldFingerprint(fp1, nBits1 // nBits2)
      elif nBits2 > nBits1:
        fp2 = DataStructs.FoldFingerprint(fp2, nBits2 // nBits1)
      sim = metric(fp1, fp2)
      if isSimilarity:
        sim = 1. - sim
      res[nSoFar] = sim
      nSoFar += 1
  return res


def ClusterPoints(data, metric, algorithmId, haveLabels=False, haveActs=True,
                  returnDistances=False):
  """ Clusters the points in data and returns the labelled cluster tree

    Arguments:

      - data: a sequence of tuples: label/id in position 0, fingerprint in
        position 1 and, optionally, an activity in position 2

      - metric: the similarity metric handed to GetDistanceMatrix()

      - algorithmId: passed through to Murtagh.ClusterData()

      - haveLabels: if true, position 0 of each tuple is used directly as the
        point name; otherwise the name is 'Mol: <id>'

      - haveActs: if true and the first tuple has more than 2 elements,
        position 2 of every tuple is converted to an int and attached to
        the tree

      - returnDistances: if true, a 2-tuple (tree, distance matrix) is
        returned instead of just the tree

  """
  message('Generating distance matrix.\n')
  distances = GetDistanceMatrix(data, metric)
  message('Clustering\n')
  tree = Murtagh.ClusterData(distances, len(data), algorithmId, isDistData=1)[0]

  # only the first entry is inspected to decide whether activities are present
  activities = []
  if haveActs and len(data[0]) > 2:
    activities = [int(entry[2]) for entry in data]

  if haveLabels:
    names = [entry[0] for entry in data]
  else:
    names = ['Mol: %s' % str(entry[0]) for entry in data]

  tree._ptLabels = names
  if activities:
    tree._ptValues = activities

  for leaf in tree.GetPoints():
    # NOTE(review): the -1 suggests that point indices are 1-based -- confirm
    pos = leaf.GetIndex() - 1
    leaf.SetName(names[pos])
    if activities:
      # best effort: a point whose activity cannot be set is left without data
      try:
        leaf.SetData(int(activities[pos]))
      except Exception:
        pass

  if returnDistances:
    return tree, distances
  return tree


def ClusterFromDetails(details):
  """ Returns the cluster tree

    Fingerprints are pulled using MolSimilarity.GetFingerprints(details) and
    truncated to details.maxMols entries when that value is positive.

    If details.outFileName is set, the file is opened for writing before
    clustering starts and the cluster tree is pickled into it.  The file is
    always closed again before this function returns.

    Returns None if the output file cannot be opened or if there are no
    fingerprints to cluster.

  """
  data = MolSimilarity.GetFingerprints(details)
  if details.maxMols > 0:
    data = data[:details.maxMols]

  outF = None
  if details.outFileName:
    try:
      outF = open(details.outFileName, 'wb+')
    except IOError:
      error("Error: could not open output file %s for writing\n" % (details.outFileName))
      return None

  # the try/finally makes sure the output file is closed on every exit path
  # (no data, an exception while clustering or pickling, normal completion)
  try:
    if not data:
      return None

    clustTree = ClusterPoints(data, details.metric, details.clusterAlgo, haveLabels=0,
                              haveActs=1)
    if outF:
      pickle.dump(clustTree, outF)
    return clustTree
  finally:
    if outF is not None:
      outF.close()


# Usage text for the command line app.  When this file is run as a script it
# is assigned to FingerprintMols._usageDoc before the arguments are parsed.
_usageDoc = """
Usage: ClusterMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>.  Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --idName=val: sets the name of the id column in the input
      database.  Default is *ID*.

    - -o _outFileName_:  name of the output file (output will
      be a pickle (.pkl) file with the cluster tree)

    - --actTable=val: name of table containing activity values
     (used to color points in the cluster tree).

    - --actName=val: name of column with activities in the activity
      table.  The values in this column should either be integers or
      convertible into integers.

    - --SLINK: use the single-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --CLINK: use the complete-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --UPGMA: use the group-average clustering algorithm
      (default is Ward's minimum variance)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the input db table.
      Default is *AutoFragmentFP*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints. Default is *2*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Command line entry point: parse the arguments with FingerprintMols and
# cluster according to the resulting details object.
if __name__ == '__main__':
  message("This is ClusterMols\n\n")
  # install this module's usage text before the arguments are parsed
  # (presumably ParseArgs reports FingerprintMols._usageDoc on bad input --
  # confirm in FingerprintMols)
  FingerprintMols._usageDoc = _usageDoc
  details = FingerprintMols.ParseArgs()
  # the returned tree is discarded here; it is only kept if an output file
  # was requested, in which case ClusterFromDetails pickles it
  ClusterFromDetails(details)