File: InfoGainFuncs.h

package info (click to toggle)
rdkit 201203-3
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 37,840 kB
  • sloc: cpp: 93,902; python: 51,897; java: 5,192; ansic: 3,497; xml: 2,499; sql: 1,641; yacc: 1,518; lex: 1,076; makefile: 325; fortran: 183; sh: 153; cs: 51
file content (138 lines) | stat: -rw-r--r-- 3,318 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// $Id: InfoGainFuncs.h 1508 2010-09-15 04:18:59Z glandrum $
//
//  Copyright (C) 2003 Rational Discovery LLC
//

#ifndef INFOGAINFUNC_H
#define INFOGAINFUNC_H

#include <RDGeneral/types.h>

namespace RDInfoTheory {

  template<class T> double ChiSquare(T *dMat, long int dim1,long int dim2) {
    // For a contingency matrix with each column corresponding to a class and each row to a 
    // the descriptor (or variable) state, the matrix looks something like for 3x3 problem
    // 
    //            1    2    3   Totals
    //      1 |  N11  N12  N13    R1
    //      2 |  N21  N22  N23    R2
    //      3 |  N31  N32  N33    R3
    // Totals |   C1   C2   C3    N
    //
    //  Th chi squere formula is 
    //  chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
    T *rowSums, *colSums;
    int i, j, tSum;
    // find the row sum
    tSum = 0;
    rowSums = new T[dim1];
    for (i = 0; i < dim1; i++) {
      int idx1 = i*dim2;
      rowSums[i] = (T)0.0;
      for (j = 0; j < dim2; j++) {
        rowSums[i] += dMat[idx1 + j];
      }
      tSum += (int)rowSums[i];
    }

    // find the column sums
    colSums = new T[dim2];
    for (i = 0; i < dim2; i++) {
      colSums[i] = (T)0.0;
      for (j = 0; j < dim1; j++) {
        colSums[i] += dMat[j*dim2 + i];
      }
    }
    
    double chi = 0.0;
    for ( i = 0; i < dim1; i++) {
      double rchi = 0.0;
      for (j = 0; j < dim2; j++) {
        rchi += (pow((double)dMat[i*dim2 + j], 2)/colSums[j]);
      }
      chi += ( ((double)tSum/rowSums[i])*rchi );
    }
    chi -= tSum;
    delete [] rowSums;
    delete [] colSums;

    return chi;
  }

  template<class T> double InfoEntropy(T *tPtr, long int dim) {
    int i;
    T nInstances = 0;
    double accum=0.0,d;
    
    for(i=0;i<dim;i++){
      nInstances += tPtr[i];
    }
  
    if(nInstances != 0){
      for(i=0;i<dim;i++){
        d = (double)tPtr[i]/nInstances;
        if(d != 0){
          accum += -d*log(d);
        }
      }
    }
    return accum/log(2.0);
  }

  template<class T> double InfoEntropyGain(T *dMat, long int dim1,long int dim2) {
    T *variableRes, *overallRes;
    double gain,term2;
    int tSum;

    //std::cerr<<" --------\n    ieg: "<<dim1<<" "<<dim2<<std::endl;
    variableRes = new T[dim1];
    for(long int i=0;i<dim1;i++){
      long int idx1 = i*dim2;
      variableRes[i] = (T)0.0;
      for(long int j=0;j<dim2;j++){
        variableRes[i] += dMat[idx1+j];
        //std::cerr<<"  "<<i<<" "<<j<<" "<<dMat[idx1+j]<<std::endl;
      }
    }

    overallRes = new T[dim2];
    // do the col sums
    for(long int i=0;i<dim2;i++){
      overallRes[i] = (T)0.0;
      for(long int j=0;j<dim1;j++){
        overallRes[i] += dMat[j*dim2+i];
        //std::cerr<<"  "<<i<<" "<<j<<" "<<dMat[j*dim2+i]<<std::endl;
      }
    }

    term2 = 0.0;
    for(long int i=0;i<dim1;i++) {
      T *tPtr;
      tPtr = dMat + i*dim2;
      term2 += variableRes[i] * InfoEntropy(tPtr,dim2);
    }
    tSum = 0;
    for(long int i=0;i<dim2;i++){
      tSum += static_cast<int>(overallRes[i]);
    }
    
    if(tSum != 0){
      term2 /= tSum;
      gain = InfoEntropy(overallRes,dim2) - term2;
    }
    else{
      gain = 0.0;
    }
    //std::cerr<<"  >gain> "<<gain<<std::endl;
    
    delete [] overallRes;
    delete [] variableRes;
    return gain;
  }
   
  
}
#endif