File: InfoBitRanker.h

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (277 lines) | stat: -rw-r--r-- 9,145 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
// $Id$
//
//  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

#include <RDGeneral/export.h>
#ifndef _RD_INFORANKER_H_
#define _RD_INFORANKER_H_

#include <RDGeneral/types.h>
#include <DataStructs/BitVects.h>
#include <iostream>

/*! \brief Class used to rank bits based on a specified measure of infomation
 *
 * Basically a primitive mimic of the CombiChem "signal" functionality
 * To use:
 *  - create an instance of this class
 *  - loop over the fingerprints in the dataset by calling accumulateVotes
 *method
 *  - call getTopN to get the top n ranked bits
 *
 * Sample usage and results from the python wrapper:
 * Here's a small set of vectors:
 * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
 * ...
 * 0001 0
 * 0101 0
 * 0010 1
 * 1110 1
 *
 * Default ranker, using infogain:
 * >>> ranker = InfoBitRanker(4,2)
 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
 * ...
 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
 *int(bit),'%.3f'%gain,int(n0),int(n1)
 * ...
 * 3 1.000 2 0
 * 2 1.000 0 2
 * 0 0.311 0 1
 *
 * Using the biased infogain:
 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
 * >>> ranker.SetBiasList((1,))
 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
 * ...
 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
 *int(bit),'%.3f'%gain,int(n0),int(n1)
 * ...
 * 2 1.000 0 2
 * 0 0.311 0 1
 * 1 0.000 1 1
 *
 * A chi squared ranker is also available:
 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
 * ...
 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
 *int(bit),'%.3f'%gain,int(n0),int(n1)
 * ...
 * 3 4.000 2 0
 * 2 4.000 0 2
 * 0 1.333 0 1
 *
 * As is a biased chi squared:
 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
 * >>> ranker.SetBiasList((1,))
 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
 * ...
 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
 *int(bit),'%.3f'%gain,int(n0),int(n1)
 * ...
 * 2 4.000 0 2
 * 0 1.333 0 1
 * 1 0.000 1 1
 */
namespace RDInfoTheory {
typedef std::vector<RDKit::USHORT> USHORT_VECT;
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;

class RDKIT_INFOTHEORY_EXPORT InfoBitRanker {
 public:
  /*! \brief the type of measure for information
   *
   */
  typedef enum {
    ENTROPY = 1,
    BIASENTROPY = 2,
    CHISQUARE = 3,
    BIASCHISQUARE = 4
  } InfoType;

  /*! \brief Constructor
   *
   * ARGUMENTS:
   *
   *   - nBits: the dimension of the bit vectors or the fingerprint length
   *   - nClasses: the number of classes used in the classification problem
   *(e.g. active,
   *              moderately active, inactive etc.). It is assumed that the
   *classes are
   *              numbered from 0 to (nClasses - 1)
   *   - infoType: the type of information metric
   */
  InfoBitRanker(unsigned int nBits, unsigned int nClasses,
                InfoType infoType = InfoBitRanker::ENTROPY)
      : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
    d_counts.resize(0);
    for (unsigned int i = 0; i < nClasses; i++) {
      USHORT_VECT cCount;
      cCount.resize(d_dims, 0);
      d_counts.push_back(cCount);
    }
    d_clsCount.resize(d_classes, 0);
    d_nInst = 0;
    d_top = 0;
    dp_topBits = 0;
    d_biasList.resize(0);
    dp_maskBits = 0;
  }

  ~InfoBitRanker() {
    if (dp_topBits) delete[] dp_topBits;
    if (dp_maskBits) delete dp_maskBits;
  }

  /*! \brief Accumulate the votes for all the bits turned on in a bit vector
   *
   *  ARGUMENTS:
   *
   *   - bv : bit vector that supports [] operator
   *   - label : the class label for the bit vector. It is assumed that 0 <=
   *class < nClasses
   */
  void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
  void accumulateVotes(const SparseBitVect &bv, unsigned int label);

  /*! \brief Returns the top n bits ranked by the information metric
   *
   * This is actually the function where most of the work of ranking is
   *happening
   *
   *  \param num the number of top ranked bits that are required
   *
   *  \return a pointer to an information array. The client should *not*
   *          delete this
   */
  double *getTopN(unsigned int num);

  /*! \brief return the number of labelled instances(examples) or fingerprints
   *seen so far
   *
   */
  unsigned int getNumInstances() const { return d_nInst; }

  /*! \brief return the number of classes
   *
   */
  unsigned int getNumClasses() const { return d_classes; }

  /*! \brief Set the classes to which the entropy calculation should be biased
   *
   * This list contains a set of class ids used when in the BIASENTROPY mode of
   *ranking bits.
   * In this mode, a bit must be correllated higher with one of the biased
   *classes than all the
   * other classes. For example, in a two class problem with actives and
   *inactives, the fraction of
   * actives that hit the bit has to be greater than the fraction of inactives
   *that hit the bit
   *
   * ARGUMENTS:
   *   classList - list of class ids that we want a bias towards
   */
  void setBiasList(RDKit::INT_VECT &classList);

  /*! \brief Set the bits to be used as a mask
   *
   * If this function is called, only the bits which are present in the
   *   maskBits list will be used.
   *
   * ARGUMENTS:
   *   maskBits - the bits to be considered
   */
  void setMaskBits(RDKit::INT_VECT &maskBits);

  /*! \brief Write the top N bits to a stream
   *
   */
  void writeTopBitsToStream(std::ostream *outStream) const;

  /*! \brief Write the top bits to a file
   *
   */
  void writeTopBitsToFile(const std::string &fileName) const;

 private:
  /*! \brief check if we want to compute the info content for a bit based on the
   *bias list
   *
   * This what happens here:
   *    - the fraction of items in each class that hit a particular bit are
   *computed
   *    - the maximum of these fractions for classes that are not in the
   *biasList are computed
   *    - If this maximum is less than the fraction for atleast one of classes
   *in the biaslist
   *      the bit is considered good
   * ARGUMENTS:
   *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num
   *of classes))
   *              a 2D structure is assumed with the first row containing number
   *of items of each class
   *              with the bit set and the second row to entires of each class
   *with the bit turned off
   */
  bool BiasCheckBit(RDKit::USHORT *resMat) const;

  /*! \brief Compute the biased info entropy gain based on the bias list
   *
   *  This what happens here:
   *    - we call BiasCheckBit to see if the bit qualifies to compute the
   *infocontent
   *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
   *
   * ARGUMENTS:
   *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num
   *of classes))
   *              a 2D structure is assumed with the first row containing number
   *of items of each class
   *              with the bit set and the second row to entires of each class
   *with the bit turned off
   */
  double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;

  /*! \brief Compute the biased chi qsure value based on the bias list
   *
   *  This what happens here:
   *    - we call BiasCheckBit to see if the bit qualifies to compute the
   *infocontent
   *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
   *
   * ARGUMENTS:
   *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num
   *of classes))
   *              a 2D structure is assumed with the first row containing number
   *of items of each class
   *              with the bit set and the second row to entires of each class
   *with the bit turned off
   */
  double BiasChiSquareGain(RDKit::USHORT *resMat) const;

  unsigned int d_dims;     // the number of bits in the fingerprints
  unsigned int d_classes;  // the number of classes (active, inactive,
                           // moderately active etc.)
  InfoType d_type;  // the type of information meassure - currently we support
                    // only entropy
  VECT_USHORT_VECT d_counts;  // place holder of counting the number of hits for
                              // each bit for each class
  USHORT_VECT d_clsCount;  // counter for the number of instances of each class
  double *dp_topBits;  // storage for the top ranked bits and the corresponding
                       // statistics
  unsigned int d_top;  // the number of bits that have been ranked
  unsigned int d_nInst;  // total number of instances or fingerprints used
                         // accumulate votes
  RDKit::INT_VECT
      d_biasList;  // if we want a bias towards certain classes in ranking bits
  ExplicitBitVect *dp_maskBits;  // allows only certain bits to be considered
};
}
#endif