1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
|
// $Id$
//
// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef _RD_INFORANKER_H_
#define _RD_INFORANKER_H_
#include <RDGeneral/types.h>
#include <DataStructs/BitVects.h>
#include <iostream>
/*! \brief Class used to rank bits based on a specified measure of infomation
*
* Basically a primitive mimic of the CombiChem "signal" functionality
* To use:
* - create an instance of this class
* - loop over the fingerprints in the dataset by calling accumulateVotes
*method
* - call getTopN to get the top n ranked bits
*
* Sample usage and results from the python wrapper:
* Here's a small set of vectors:
* >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
* ...
* 0001 0
* 0101 0
* 0010 1
* 1110 1
*
* Default ranker, using infogain:
* >>> ranker = InfoBitRanker(4,2)
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 3 1.000 2 0
* 2 1.000 0 2
* 0 0.311 0 1
*
* Using the biased infogain:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
* >>> ranker.SetBiasList((1,))
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 2 1.000 0 2
* 0 0.311 0 1
* 1 0.000 1 1
*
* A chi squared ranker is also available:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 3 4.000 2 0
* 2 4.000 0 2
* 0 1.333 0 1
*
* As is a biased chi squared:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
* >>> ranker.SetBiasList((1,))
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 2 4.000 0 2
* 0 1.333 0 1
* 1 0.000 1 1
*/
namespace RDInfoTheory {
typedef std::vector<RDKit::USHORT> USHORT_VECT;
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
class RDKIT_INFOTHEORY_EXPORT InfoBitRanker {
public:
/*! \brief the type of measure for information
*
*/
typedef enum {
ENTROPY = 1,
BIASENTROPY = 2,
CHISQUARE = 3,
BIASCHISQUARE = 4
} InfoType;
/*! \brief Constructor
*
* ARGUMENTS:
*
* - nBits: the dimension of the bit vectors or the fingerprint length
* - nClasses: the number of classes used in the classification problem
*(e.g. active,
* moderately active, inactive etc.). It is assumed that the
*classes are
* numbered from 0 to (nClasses - 1)
* - infoType: the type of information metric
*/
InfoBitRanker(unsigned int nBits, unsigned int nClasses,
InfoType infoType = InfoBitRanker::ENTROPY)
: d_dims(nBits), d_classes(nClasses), d_type(infoType) {
d_counts.resize(0);
for (unsigned int i = 0; i < nClasses; i++) {
USHORT_VECT cCount;
cCount.resize(d_dims, 0);
d_counts.push_back(cCount);
}
d_clsCount.resize(d_classes, 0);
d_nInst = 0;
d_top = 0;
dp_topBits = 0;
d_biasList.resize(0);
dp_maskBits = 0;
}
~InfoBitRanker() {
if (dp_topBits) delete[] dp_topBits;
if (dp_maskBits) delete dp_maskBits;
}
/*! \brief Accumulate the votes for all the bits turned on in a bit vector
*
* ARGUMENTS:
*
* - bv : bit vector that supports [] operator
* - label : the class label for the bit vector. It is assumed that 0 <=
*class < nClasses
*/
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
void accumulateVotes(const SparseBitVect &bv, unsigned int label);
/*! \brief Returns the top n bits ranked by the information metric
*
* This is actually the function where most of the work of ranking is
*happening
*
* \param num the number of top ranked bits that are required
*
* \return a pointer to an information array. The client should *not*
* delete this
*/
double *getTopN(unsigned int num);
/*! \brief return the number of labelled instances(examples) or fingerprints
*seen so far
*
*/
unsigned int getNumInstances() const { return d_nInst; }
/*! \brief return the number of classes
*
*/
unsigned int getNumClasses() const { return d_classes; }
/*! \brief Set the classes to which the entropy calculation should be biased
*
* This list contains a set of class ids used when in the BIASENTROPY mode of
*ranking bits.
* In this mode, a bit must be correllated higher with one of the biased
*classes than all the
* other classes. For example, in a two class problem with actives and
*inactives, the fraction of
* actives that hit the bit has to be greater than the fraction of inactives
*that hit the bit
*
* ARGUMENTS:
* classList - list of class ids that we want a bias towards
*/
void setBiasList(RDKit::INT_VECT &classList);
/*! \brief Set the bits to be used as a mask
*
* If this function is called, only the bits which are present in the
* maskBits list will be used.
*
* ARGUMENTS:
* maskBits - the bits to be considered
*/
void setMaskBits(RDKit::INT_VECT &maskBits);
/*! \brief Write the top N bits to a stream
*
*/
void writeTopBitsToStream(std::ostream *outStream) const;
/*! \brief Write the top bits to a file
*
*/
void writeTopBitsToFile(const std::string &fileName) const;
private:
/*! \brief check if we want to compute the info content for a bit based on the
*bias list
*
* This what happens here:
* - the fraction of items in each class that hit a particular bit are
*computed
* - the maximum of these fractions for classes that are not in the
*biasList are computed
* - If this maximum is less than the fraction for atleast one of classes
*in the biaslist
* the bit is considered good
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
bool BiasCheckBit(RDKit::USHORT *resMat) const;
/*! \brief Compute the biased info entropy gain based on the bias list
*
* This what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the
*infocontent
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
/*! \brief Compute the biased chi qsure value based on the bias list
*
* This what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the
*infocontent
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
double BiasChiSquareGain(RDKit::USHORT *resMat) const;
unsigned int d_dims; // the number of bits in the fingerprints
unsigned int d_classes; // the number of classes (active, inactive,
// moderately active etc.)
InfoType d_type; // the type of information meassure - currently we support
// only entropy
VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
// each bit for each class
USHORT_VECT d_clsCount; // counter for the number of instances of each class
double *dp_topBits; // storage for the top ranked bits and the corresponding
// statistics
unsigned int d_top; // the number of bits that have been ranked
unsigned int d_nInst; // total number of instances or fingerprints used
// accumulate votes
RDKit::INT_VECT
d_biasList; // if we want a bias towards certain classes in ranking bits
ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
};
}
#endif
|