File: EvenSplit.C

package info (click to toggle)
ball 1.5.0%2Bgit20180813.37fc53c-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 239,888 kB
  • sloc: cpp: 326,149; ansic: 4,208; python: 2,303; yacc: 1,778; lex: 1,099; xml: 958; sh: 322; makefile: 95
file content (108 lines) | stat: -rw-r--r-- 3,212 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//

#include <BALL/FORMAT/molFileFactory.h>
#include <BALL/FORMAT/genericMolFile.h>
#include <BALL/FORMAT/commandlineParser.h>
#include <BALL/KERNEL/molecule.h>
#include "version.h"

using namespace BALL;
using namespace std;

int main(int argc, char* argv[])
{
	CommandlineParser parpars("EvenSplit", "generate splits w/ equal property range", VERSION, String(__DATE__), "Preparation");
	parpars.registerMandatoryInputFile("i", "input file");
	parpars.registerMandatoryOutputFile("o1", "output file 1");
	parpars.registerMandatoryOutputFile("o2", "output file 2");

	parpars.registerOptionalStringParameter("prop", "property name", "binding_free_energy");
	parpars.registerOptionalIntegerParameter("n", "max. number of compounds to use from input file");
	parpars.registerOptionalIntegerParameter("k", "extract each k'th compound to 2nd output file", 2);
	parpars.registerOptionalIntegerParameter("offset", "offset; extract each (i+offset)%k == 0 to 2nd output file");
	String man = "This tool splits a molecule file into two subsets in such a way that each of them convers an equal range of a property. The property with respect to which this is to be done should be specified with '-prop'.";
	parpars.setToolManual(man);
	parpars.setSupportedFormats("i","mol2,sdf,drf");
	parpars.setSupportedFormats("o1","mol2,sdf,drf");
	parpars.setSupportedFormats("o2","mol2,sdf,drf");

	parpars.parse(argc, argv);

	GenericMolFile* input = MolFileFactory::open(parpars.get("i"), ios::in);
	GenericMolFile* output1 = MolFileFactory::open(parpars.get("o1"), ios::out, input);
	GenericMolFile* output2 = MolFileFactory::open(parpars.get("o2"), ios::out, input);

	String s = parpars.get("offset");
	int offset = 0;
	if (s != CommandlineParser::NOT_FOUND) offset = s.toInt();

	bool use_max = 0;
	int max = 0;
	s = parpars.get("n");
	if (s != CommandlineParser::NOT_FOUND)
	{
		max = s.toInt();
		if (max > 0) use_max = 1;
	}

	int k = 2;
	s = parpars.get("k");
	if (s != CommandlineParser::NOT_FOUND) k = s.toInt();
	String property = parpars.get("prop");

	multimap<double, Molecule*> prop_to_id;
	Size missing_prop = 0;
	//Size no_written = 0;

	for (Molecule* mol = input->read(); mol; mol = input->read())
	{
		if (mol->hasProperty(property))
		{
			String p = mol->getProperty(property).toString();
			prop_to_id.insert(make_pair(p.toDouble(), mol));
		}
		else missing_prop++;
	}

	int i = 1;
	Size no_o1 = 0;
	Size no_o2 = 0;
	for (multimap < double, Molecule* > ::iterator it = prop_to_id.begin();
		it!=prop_to_id.end(); it++, i++)
	{
		if (use_max && i > max)
		{
			delete it->second;
			continue;
		}

		if ((i+offset)%k == 0)
		{
			*output2 << *it->second;
			no_o2++;
		}
		else
		{
			*output1 << *it->second;
			no_o1++;
		}
		delete it->second;
	}


	if (missing_prop > 0)
	{
		Log<<"[Warning:]"<<missing_prop<<" molecules in the input file did not contain the desired property and were ignored."<<endl;
	}

	Log<<"Wrote "<<no_o1<<" molecules to file '"<<parpars.get("o1")<<"' and "<<no_o2<<" molecules to file '"<<parpars.get("o2")<<"'."<<endl;

	input->close();
	output1->close();
	output2->close();
	delete input;
	delete output1;
	delete output2;
}