File: MolPredictor.C

package info (click to toggle)
ball 1.5.0%2Bgit20180813.37fc53c-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 239,888 kB
  • sloc: cpp: 326,149; ansic: 4,208; python: 2,303; yacc: 1,778; lex: 1,099; xml: 958; sh: 322; makefile: 95
file content (132 lines) | stat: -rwxr-xr-x 4,845 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//

#include <BALL/FORMAT/commandlineParser.h>
#include <BALL/QSAR/registry.h>
#include <BALL/QSAR/configIO.h>
#include <fstream>
#include "version.h"

using namespace BALL::QSAR;
using namespace BALL;
using namespace std;

int main(int argc, char* argv[])
{
	CommandlineParser par("MolPredictor","predict molecule activities with QSAR model", VERSION, String(__DATE__), "QuEasy (QSAR)");
	par.registerMandatoryInputFile("i","input sd-file");
	par.registerMandatoryInputFile("mod","file containing QSAR model");
	par.registerMandatoryOutputFile("o","output sd-file");
	par.registerOptionalInputFile("csv","input csv-file w/ additional descriptors");
	par.registerOptionalIntegerParameter("csv_nr","no. of response variables in csv-file");
	par.registerFlag("sdp","use sd-properties as additional descriptors");
	par.registerFlag("csv_cl","csv-file has compound (row) labels");
	par.registerFlag("csv_dl","csv-file has descriptor (column) labels");
	par.registerOptionalIntegerParameter("csv_sep","separator symbol in csv-file");
	par.registerFlag("rm", "remove input sd-file when finished");

	String man = "This tool predictes the response values of compounds in the given molecule file using the specified QSAR model.\n\nInput of this tool is a molecule file (sdf,mol2,drf) and a model-file as generated by ModelCreator or FeatureSelector.\nFeatures for all molecules in the input file are generated automatically. However, if you used an additional, externally generated feature-set to generate your QSAR model, make sure to generate features in the same manner (i.e. using the same external tool with the same settings) for the molecule file to be used here and specify the csv-file with the above options.\n\nOutput of this tool (as specified by '-o') is a molecule file containing the predicted values as a property tag named 'predicted_activity'.";
	par.setToolManual(man);
	par.setSupportedFormats("i","sdf");
	par.setSupportedFormats("mod","mod");
	par.setSupportedFormats("csv","csv");
	par.setSupportedFormats("o","sdf,txt");
	par.parse(argc,argv);

	string input = par.get("i");
	string model_file = par.get("mod");
	string output = par.get("o");

	String csv = "";
	String s = par.get("csv");
	if (s!=CommandlineParser::NOT_FOUND) csv = s;
	int csv_no_response = 0;
	s = par.get("csv_nr");
	if (s!=CommandlineParser::NOT_FOUND) csv_no_response = s.toInt();
	bool csv_desc_labels = 0;
	s = par.get("csv_dl");
	if (s!=CommandlineParser::NOT_FOUND) csv_desc_labels = s.toInt();
	bool csv_compound_labels = 0;
	s = par.get("csv_cl");
	if (s!=CommandlineParser::NOT_FOUND) csv_compound_labels = s.toInt();
	String csv_separator = "\t";
	s = par.get("csv_sep");
	if (s!=CommandlineParser::NOT_FOUND) csv_separator = s;
	bool read_sd_descriptors = 0;
	s = par.get("sdp");
	if(s!=CommandlineParser::NOT_FOUND) read_sd_descriptors = s.toBool();

	bool txt_output=0;
	if(output.size()>4 && output.substr(output.size()-4)==".txt")
	{
		txt_output=true;
	}

	/// read model from given file
	QSARData q;
	Model* m = createNewModelFromFile(model_file,q);


	/// read molecules and descriptors
	set<String> act;
	Log.level(5) << "Will now read input-file and generate features ... "; Log.flush();
	q.readSDFile(input.c_str(),act,read_sd_descriptors,0,0,1,1);
	Log.level(5) << "done. " << endl << flush;
	if(csv!="") q.readCSVFile(csv.c_str(),0,csv_desc_labels,csv_compound_labels,csv_separator.c_str(),csv_no_response);

	/// predict acitivity of each molecule in the SD-file
	vector<double> activities(q.getNoSubstances());
	Size total = q.getNoSubstances();
	for(unsigned int i=0;i<total;i++)
	{
		if(i%50==0)
		{
			Log.level(5) << "\tPredicting activities of compounds: " << (i+1) << "/" << total;
			Log.flush();
		}
		vector<double>* v = q.getSubstance(i); // get UNcentered descriptor-vector of test compound
		Eigen::VectorXd res = m->predict(*v,1); // transform val. data according to centering of training data
		delete v;
		activities[i] = res[0];
	}
	Log.level(5) << "\tPredicting activities of compounds: " << total << "/" << total << endl;


	/// write molecules with appended activity-labels to output sd-file
	if(!txt_output)
	{
		SDFile sd_in(input);
		SDFile sd_out(output,ios::out);
		Molecule* mol;
		uint no_pos=0;
		for(uint i=0; (mol=sd_in.read()); i++)
		{
			if(i%50==0)
			{
				Log.level(5) << "\tSaving compounds: " << (i+1) << "/" << total; Log.flush();
			}
			mol->setProperty("predicted_activity",activities[i]);
			no_pos++;
			sd_out << *mol;

			delete mol;
		}
		Log.level(5) << "\tSaving compounds: " << total << "/" << total << endl;
	}
	else
	{
		ofstream out(output.c_str());
		for(unsigned int i=0; i<activities.size(); i++)
		{
			out<<"mol_"<<i<<"\t"<<activities[i]<<endl;
		}
	}

	delete m;

	if (par.has("rm"))
	{
		File::remove(par.get("i"));
	}
}