File: classificationModel.C

package info (click to toggle)
ball 1.5.0%2Bgit20180813.37fc53c-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 239,888 kB
  • sloc: cpp: 326,149; ansic: 4,208; python: 2,303; yacc: 1,778; lex: 1,099; xml: 958; sh: 322; makefile: 95
file content (162 lines) | stat: -rw-r--r-- 4,754 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//
// 

#include <BALL/QSAR/classificationModel.h>

namespace BALL
{
	namespace QSAR
	{
				
		// Builds a classification model on top of the given QSAR data set.
		// The base class Model is initialised with q; afterwards a
		// ClassificationValidation object bound to this model is allocated and
		// stored both in 'validation' and in 'model_val'.
		ClassificationModel::ClassificationModel(const QSARData& q)
			: Model(q)
		{
			// one allocation, two handles pointing at the same object
			model_val = validation = new ClassificationValidation(this);
		}


		// Destroys the model and frees the ClassificationValidation object that
		// the constructor allocated with 'new' and stored in 'validation'.
		// NOTE(review): 'model_val' is set to the same pointer in the constructor;
		// it is not deleted a second time here.
		ClassificationModel::~ClassificationModel()
		{
			delete validation;
		}

		// Assignment from another classification model.
		// This function itself only forwards to the assignment operator of the
		// base class Model; it does not copy labels_ or no_substances_.
		void ClassificationModel::operator = (ClassificationModel& m)
		{
			this->Model::operator = (m);
		}

		// Rebuilds labels_ from the response matrix Y_.
		// Every entry of Y_ must be an integral value; each distinct value is
		// stored once, in ascending order.
		//
		// @throw Exception::WrongDataType if an entry of Y_ differs from its
		//        value truncated to int. labels_ is left untouched in that case.
		void ClassificationModel::readLabels()
		{
			// a set keeps every label once and already sorted
			std::set<int> distinct_labels;

			for (int row = 0; row < Y_.rows(); ++row)
			{
				for (int col = 0; col < Y_.cols(); ++col)
				{
					const int label = static_cast<int>(Y_(row, col));

					// for classification experiments, Y must contain only ints
					if (label != Y_(row, col))
					{
						throw Exception::WrongDataType(__FILE__, __LINE__, "Some class labels are not discrete values!! Creation of a classification model is therefore not possible!");
					}

					distinct_labels.insert(label);
				}
			}

			// replace the old labels only after all of Y_ has been validated
			labels_.assign(distinct_labels.begin(), distinct_labels.end());
		}

		// Returns a copy of the class labels currently stored in labels_.
		vector < int > ClassificationModel::getClassLabels()
		{
			vector < int > labels_copy(labels_);
			return labels_copy;
		}

		void ClassificationModel::readClassInformationFromFile(std::ifstream& input, int no_classes)
		{
			labels_.clear();
			no_substances_.clear();
			
			String line;
			getline(input, line);  // skip comment line 
			getline(input, line);    
			for (int i = 0; i < no_classes; i++)
			{
				labels_.push_back(line.getField(i, "\t").toInt());
			}	
			getline(input, line);  // skip empty line
			getline(input, line);  // skip comment line 
			getline(input, line);
			for (int i = 0; i < no_classes; i++)
			{
				int n = line.getField(i, "\t").toInt();
				no_substances_.push_back(n);
			}
			getline(input, line);  // skip empty line 
		}


		// Writes the class section of a model file to 'out'.
		//
		// Two paragraphs are emitted, each consisting of a '#' comment line, one
		// line of values (every value followed by a tab) and an empty line:
		// first the class labels, then the number of substances of each class.
		void ClassificationModel::saveClassInformationToFile(std::ofstream& out)
		{
			// paragraph 1: class labels
			out << "# class-labels_\n";
			unsigned int pos = 0;
			while (pos < labels_.size())
			{
				out << labels_[pos] << "\t";
				++pos;
			}
			out << std::endl;
			out << std::endl;

			// paragraph 2: number of substances of each class
			out << "# no of substances of each class\n";
			pos = 0;
			while (pos < no_substances_.size())
			{
				out << no_substances_[pos] << "\t";
				++pos;
			}
			out << std::endl;
			out << std::endl;
		}


		void ClassificationModel::equalSpaceDiscretization(unsigned int bins, Eigen::MatrixXd & discretization_information)
		{
			unsigned int no_features = descriptor_matrix_.cols();
			unsigned int no_compounds = descriptor_matrix_.rows();
			
			discretization_information.resize(2, no_features); 
			discretization_information.row(0).fill( 1e10); // minimum of each feature in first row
			discretization_information.row(1).fill(-1e10); // maximum of each feature in second row
			
			// find minimum and maximum of each feature
			for (unsigned int i = 0; i < no_features; i++)
			{
				for (unsigned int j = 0; j < no_compounds; j++)
				{
					if (descriptor_matrix_(j, i) < discretization_information(0, i))
					{
						discretization_information(0, i) = descriptor_matrix_(j, i); 
					}
					if (descriptor_matrix_(j, i) > discretization_information(1, i))
					{
						discretization_information(1, i) = descriptor_matrix_(j, i); 
					}
				}
			}

			// transform each feature value to a discrete value
			for (unsigned int i = 0; i < no_features; i++)
			{
				double step_width = (discretization_information(1, i)-discretization_information(0, i)) / bins; 
				for (unsigned int j = 0; j < no_compounds; j++)
				{
					unsigned int feat_bucket = (unsigned int)((descriptor_matrix_(j, i)-discretization_information(0, i)) / step_width); 
					if (feat_bucket >= bins) feat_bucket = bins-1; // for max.
					descriptor_matrix_(j, i) = feat_bucket;
				}	
			}
		}


		// Discretizes one test compound in place, using the per-feature minimum
		// (row 0) and maximum (row 1) stored in 'discretization_information'.
		//
		// Each feature value is replaced by the index of the equally wide interval
		// it falls into. Values below the stored minimum are mapped to bucket 0,
		// values at or above the stored maximum to bucket bins-1.
		//
		// @param compound feature vector of the test compound; overwritten with bucket indices
		// @param bins number of intervals per feature; must be greater than zero
		// @param discretization_information 2 x (number of features) matrix holding
		//        minimum (row 0) and maximum (row 1) of each feature
		// @throw Exception::GeneralException if bins is zero or if the number of
		//        features of the compound differs from the number of columns of
		//        discretization_information
		//
		// Features whose stored range is empty (maximum not greater than minimum)
		// have no interval width; they are mapped to bucket 0 instead of dividing
		// by zero.
		void ClassificationModel::equalSpaceDiscretizationTestData(Eigen::VectorXd & compound, unsigned int bins, const Eigen::MatrixXd & discretization_information)
		{
			if (bins == 0)
			{
				// a width of (max-min)/0 is meaningless and bins-1 would wrap around
				throw BALL::Exception::GeneralException(__FILE__, __LINE__, "Discretization error", "number of bins must be greater than zero!");
			}
			if (compound.rows() != discretization_information.cols())
			{
				throw BALL::Exception::GeneralException(__FILE__, __LINE__, "Discretization error", "no of features of test compound and of discretized training data are different!");
			}

			const unsigned int no_features = compound.rows();

			for (unsigned int i = 0; i < no_features; i++)
			{
				const double minimum = discretization_information(0, i);
				const double step_width = (discretization_information(1, i) - minimum) / bins;

				unsigned int feat_bucket = 0;

				// step_width <= 0 (or NaN) means there is no range to divide; keep bucket 0
				if (step_width > 0)
				{
					const double position = (compound(i) - minimum) / step_width;

					// Clamp in floating point BEFORE converting: converting NaN or an
					// out-of-range double to an integer type is undefined behaviour.
					if (position >= bins)
					{
						feat_bucket = bins - 1;
					}
					else if (position >= 1)
					{
						feat_bucket = static_cast<unsigned int>(position);
					}
					// everything below 1 (including values below the minimum) stays in bucket 0
				}

				compound(i) = feat_bucket;
			}
		}
	}
}