File: BitClusterer.py

package info (click to toggle)
rdkit 201203-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 37,840 kB
  • sloc: cpp: 93,902; python: 51,897; java: 5,192; ansic: 3,497; xml: 2,499; sql: 1,641; yacc: 1,518; lex: 1,076; makefile: 325; fortran: 183; sh: 153; cs: 51
file content (88 lines) | stat: -rwxr-xr-x 2,682 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#
#  Copyright (C) 2000-2008  Greg Landrum and Rational Discovery LLC
#

from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv
# Fail at import time when the compiled picker module is unavailable
# (presumably the package exposes None when the extension was not built).
# The exception is raised with call syntax: the old `raise E, 'msg'` statement
# form is Python 2 only and makes this file a SyntaxError on Python 3.
if rdsimdiv is None:
    raise ImportError('rdSimDivPickers not built')
from rdkit import DataStructs
import numpy

class BitClusterer(object):
    """ Groups fingerprint bits into clusters based on their correlation

    The correlation matrix is not computed by this class: the caller builds
    it (e.g. from fingerprints read from a database or a list) and hands it
    to ClusterBits().  Once clusters are available, either from ClusterBits()
    or from SetClusters(), a fingerprint can be reduced to per-cluster
    scores or to a per-cluster bit vector.
    """

    def __init__(self, idList, nCluster, type=rdsimdiv.ClusterMethod.WARD):
        """ Store the clustering configuration; no clustering happens here

        ARGUMENTS:
         - idList : the ids of the bits to be clustered; the cluster members
                    reported by the picker are used as indices into it
         - nCluster : the number of clusters to request
         - type : the clustering method handed to HierarchicalClusterPicker
        """
        self._type = type
        self._nClusters = nCluster
        self._bidList = idList
        self._clusters = []

    def ClusterBits(self, corrMat):
        """ Cluster the bits and keep the result as lists of bit ids

        The result replaces any previously stored clusters and is available
        through GetClusters().

        ARGUMENTS:
         - corrMat : the bit correlation values; must support 1/corrMat
                     NOTE(review): the layout the picker expects is not
                     visible here, and zero entries would end up as a
                     division by zero -- confirm against the callers
        """
        # the clustering code needs distances rather than correlations, so
        # take the reciprocal of every element of the correlation matrix
        distances = 1 / corrMat

        picker = rdsimdiv.HierarchicalClusterPicker(self._type)
        rawClusters = picker.Cluster(distances, len(self._bidList),
                                     self._nClusters)

        # the picker reports positions in the id list; translate each
        # cluster into the actual bit ids
        self._clusters = []
        for members in rawClusters:
            self._clusters.append([self._bidList[pos] for pos in members])

    def SetClusters(self, clusters):
        """ Install externally computed clusters

        ARGUMENTS:
         - clusters : a sequence of clusters; its length must match the
                      number of clusters given at construction (asserted)
        """
        nProvided = len(clusters)
        assert nProvided == self._nClusters
        self._clusters = clusters

    def GetClusters(self):
        """ Return the currently stored clusters """
        return self._clusters

    def MapToClusterScores(self, fp):
        """ Count, for every cluster, how many of its bits are set in fp

        The returned list has one entry per cluster (its length is the
        number of clusters given at construction); entry k is the number of
        bits of cluster k that are turned on in the fingerprint.

        ARGUMENTS:
         - fp : the fingerprint; indexed by bit id
        """
        scores = [0] * self._nClusters

        for clusterIdx, members in enumerate(self._clusters):
            for bitId in members:
                if fp[bitId]:
                    scores[clusterIdx] += 1

        return scores

    def MapToClusterFP(self, fp):
        """ Fold the fingerprint into one with a single bit per cluster

        Bit k of the returned ExplicitBitVect is set when at least one bit
        of cluster k is turned on in the original fingerprint.

        ARGUMENTS:
         - fp : the fingerprint; indexed by bit id
        """
        folded = DataStructs.ExplicitBitVect(self._nClusters)

        for clusterIdx, members in enumerate(self._clusters):
            # any() stops at the first bit that is on, so the remaining
            # members of the cluster are not inspected
            if any(fp[bitId] for bitId in members):
                folded.SetBit(clusterIdx)

        return folded