File: InfoGainFuncs.h

package info (click to toggle)
rdkit 201809.1%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 123,688 kB
  • sloc: cpp: 230,509; python: 70,501; java: 6,329; ansic: 5,427; sql: 1,899; yacc: 1,739; lex: 1,243; makefile: 445; xml: 229; fortran: 183; sh: 123; cs: 93
file content (139 lines) | stat: -rw-r--r-- 3,163 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// $Id$
//
//  Copyright (C) 2003 Rational Discovery LLC
//

#include <RDGeneral/export.h>
#ifndef INFOGAINFUNC_H
#define INFOGAINFUNC_H

#include <cmath>
#include <cstddef>
#include <vector>

#include <RDGeneral/types.h>

namespace RDInfoTheory {

// Computes the chi-square statistic of a contingency table.
//
// Parameters:
//   dMat - pointer to dim1 * dim2 counts stored row by row; entry (i, j) is
//          read from dMat[i * dim2 + j]. The data is only read.
//   dim1 - number of rows (descriptor / variable states)
//   dim2 - number of columns (classes)
//
// Returns the chi-square value. Rows and columns whose total is zero are
// skipped (they carry no observations), and 0.0 is returned when either
// dimension is not positive.
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
  // For a contingency matrix with each column corresponding to a class and
  // each row to a descriptor (or variable) state, the matrix looks something
  // like this for a 3x3 problem:
  //
  //            1    2    3   Totals
  //      1 |  N11  N12  N13    R1
  //      2 |  N21  N22  N23    R2
  //      3 |  N31  N32  N33    R3
  // Totals |   C1   C2   C3    N
  //
  //  The chi-square formula is
  //  chi = sum_i( (N/Ri) * sum_j(Nij^2/Cj) ) - N
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row and column totals, gathered in a single pass over the matrix.
  std::vector<T> rowSums(nRows, static_cast<T>(0));
  std::vector<T> colSums(nCols, static_cast<T>(0));
  for (std::size_t i = 0; i < nRows; ++i) {
    const T *row = dMat + i * nCols;
    for (std::size_t j = 0; j < nCols; ++j) {
      rowSums[i] += row[j];
      colSums[j] += row[j];
    }
  }

  // The grand total is kept as a double so that fractional (weighted) counts
  // are not truncated towards zero.
  double total = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    total += static_cast<double>(rowSums[i]);
  }

  double chi = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    if (rowSums[i] == static_cast<T>(0)) {
      continue;  // dividing by an empty row total would produce NaN
    }
    const T *row = dMat + i * nCols;
    double rchi = 0.0;
    for (std::size_t j = 0; j < nCols; ++j) {
      if (colSums[j] == static_cast<T>(0)) {
        continue;  // dividing by an empty column total would produce NaN
      }
      const double nij = static_cast<double>(row[j]);
      rchi += nij * nij / static_cast<double>(colSums[j]);
    }
    chi += (total / static_cast<double>(rowSums[i])) * rchi;
  }
  return chi - total;
}

// Shannon entropy, in bits, of the distribution described by a set of counts.
//
// Parameters:
//   tPtr - pointer to dim counts (only read)
//   dim  - number of entries in tPtr
//
// Each count is divided by the sum of all counts to obtain a probability p,
// and the value returned is -sum(p * log2(p)) over the non-zero
// probabilities. When the counts sum to zero the result is 0.
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  // total number of observations
  T total = 0;
  for (long int k = 0; k < dim; ++k) {
    total += tPtr[k];
  }
  if (total == 0) {
    return 0.0;
  }

  // accumulate in nats; the conversion to bits happens once at the end
  double entropy = 0.0;
  for (long int k = 0; k < dim; ++k) {
    const double p = (double)tPtr[k] / total;
    if (p != 0) {
      entropy -= p * log(p);
    }
  }
  return entropy / log(2.0);
}

// Computes the information entropy gain (in bits) for a contingency table.
//
// Parameters:
//   dMat - pointer to dim1 * dim2 counts stored row by row; entry (i, j) is
//          read from dMat[i * dim2 + j]. The data is only read.
//   dim1 - number of rows
//   dim2 - number of columns
//
// The gain is the entropy of the column totals minus the average of the
// per-row entropies, each row weighted by its share of the grand total.
// Returns 0.0 when either dimension is not positive or when the grand total
// is zero.
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }
  const std::size_t nRows = static_cast<std::size_t>(dim1);
  const std::size_t nCols = static_cast<std::size_t>(dim2);

  // Row totals (variableRes) and column totals (overallRes), gathered in a
  // single pass over the matrix.
  std::vector<T> variableRes(nRows, static_cast<T>(0));
  std::vector<T> overallRes(nCols, static_cast<T>(0));
  for (std::size_t i = 0; i < nRows; ++i) {
    const T *row = dMat + i * nCols;
    for (std::size_t j = 0; j < nCols; ++j) {
      variableRes[i] += row[j];
      overallRes[j] += row[j];
    }
  }

  // Sum of the row entropies, each weighted by its row total.
  double term2 = 0.0;
  for (std::size_t i = 0; i < nRows; ++i) {
    term2 += variableRes[i] * InfoEntropy(dMat + i * nCols, dim2);
  }

  // The grand total is kept as a double so that fractional (weighted) counts
  // are not truncated towards zero before being used as the divisor.
  double total = 0.0;
  for (std::size_t j = 0; j < nCols; ++j) {
    total += static_cast<double>(overallRes[j]);
  }
  if (total == 0.0) {
    return 0.0;
  }

  return InfoEntropy(overallRes.data(), dim2) - term2 / total;
}
}
#endif