File: LigandFileSplitter.C

package info (click to toggle)
ball 1.5.0%2Bgit20180813.37fc53c-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 239,888 kB
  • sloc: cpp: 326,149; ansic: 4,208; python: 2,303; yacc: 1,778; lex: 1,099; xml: 958; sh: 322; makefile: 95
file content (375 lines) | stat: -rwxr-xr-x 11,515 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
// -*- Mode: C++; tab-width: 2; -*-
// vi: set ts=2:
//

#include <BALL/FORMAT/molFileFactory.h>
#include <BALL/FORMAT/genericMolFile.h>
#include <BALL/FORMAT/dockResultFile.h>
#include <BALL/FORMAT/commandlineParser.h>
#include "version.h"

using namespace BALL;
using namespace std;


void validateParameters(CommandlineParser& parpars)
{
  if (parpars.has("o"))
	{
    if (parpars.has("output_name_pattern"))
    {
      Log.level(10) << "\rNOTE: Parameter 'output_name_pattern' is ignored because filenames are specified explicitly." << endl;
    }
    if (parpars.has("no"))
    {
      Log.level(10) << "\rNOTE: Parameter 'no' is ignored because filenames are specified explicitly." << endl;
    }
    if (parpars.has("ligands_per_file"))
    {
      Log.level(10) << "\rNOTE: Parameter 'mpf' is ignored because filenames are specified explicitly." << endl;
    }
	}
  else
  {
    if (parpars.has("no"))
    {

      if (parpars.has("o"))
      {
        Log.level(10) << "\rNOTE: Parameter 'o' is ignored because the number of output files is specified explicitly." << endl;
      }
      if (parpars.has("ligands_per_file"))
      {
        Log.level(10) << "\rNOTE: Parameter 'mpf' is ignored because the number of output files is specified explicitly." << endl;
      }
    }
    else
    {
      if (parpars.has("mpf"))
      {      
        if (parpars.has("no"))
        {
          Log.level(10) << "\rNOTE: Parameter 'no' is ignored because the number of ligands per output file is specified explicitly." << endl;
        }
        if (parpars.has("o"))
        {
          Log.level(10) << "\rNOTE: Parameter 'o' is ignored because the number of ligands per output file is specified explicitly." << endl;
        }
      }
      else
      {
        // No parameter passed that can be used to determine how to split.
        // EXIT
        
        Log.error() << "\rERROR: No parameter passed to specify how to split." << endl;
        Log.error() << "\r       Please set either 'no', 'o' or 'mpf'." << endl;
        Log.error() << "\r       Exit without producing output files." << endl;

        exit(1);
      }
    }
  }
  

	if (parpars.has("outname_pattern"))
	{
    String outname_pattern = parpars.get("outname_pattern");  
		
    // count the number of '%d' and exit if the name is different to one
		string::size_type placeholder_1 = outname_pattern.find("%d");
		string::size_type placeholder_2 = outname_pattern.rfind("%d");
		
    // if there is more than one %d, it means that the index of the first and last occurrences are different
		// if there is not a single %d, then the index of both first and last occurrences must be npos
		if (placeholder_1 != placeholder_2 || placeholder_1 == String::npos)
		{
			Log.error() << "Error: The provided value for outname_pattern '" << outname_pattern << "' is invalid." << endl;
      Log.error() << "       Exit without producing output files." << endl;
			exit(1);
		}
	}
}


String getOutputFileName(String& outname_base, bool is_pattern, String& outfile_type, unsigned int index)
{
	String outfile_name = outname_base;
  
	if (is_pattern)
	{
		outfile_name.substituteAll("%d", String(index));
	}
	else
	{
		outfile_name = outfile_name + String("_") + String(index)  + "." + outfile_type;
	}
	
	return outfile_name;
}


int main(int argc, char* argv[])
{
	CommandlineParser parpars("LigandFileSplitter", "split molecule files", VERSION, String(__DATE__), "Preparation");
	parpars.registerMandatoryInputFile("i", "input molecule file");
	parpars.registerOptionalIntegerParameter("no", "Number of output files to be created");
	parpars.registerOptionalIntegerParameter("mpf", "Number of molecules per output file");
	parpars.registerOptionalStringParameter("outname_pattern", "Pattern that will be used to generate the names of the output files, see notes and examples below.");
	parpars.registerOptionalOutputFileList("o", "Output filenames. If none are specified, input filename postfixed with IDs will be used");
  
	String man =
	"LigandFileSplitter splits a molecule file into a given number of subsets.\n\n"

	"Examples:\n\n"

  "$ LigandFileSplitter -i Trypsin_actives.sdf -o batch_1 batch_2\n"
	"  will split the input file Trypsin_actives.sdf in the two output files batch_1.sdf and batch_2.sdf.\n\n"
	
  "$ LigandFileSplitter -i Trypsin_actives.sdf -no 3\n"
	"  will split the input file Trypsin_actives.sdf in three files named Trypsin_actives_0.sdf, Trypsin_actives_1.sdf and Trypsin_actives_2.sdf\n\n"
	
  "$ LigandFileSplitter -i ligands.sdf -ligands_per_file 4\n"
	"  will split the input file ligands.sdf in as many files needed to fit at most 4 ligands per file.\n"
	"  The files will be named ligands_0.sdf, ligands_1.sdf ... ligands_N.sdf\n\n"
	
  "$ LigandFileSplitter -i ligands.sdf -ligands_per_file 5 -outname_pattern split_ligands-%d.sdf\n"
	"  will split the input file ligands.sdf in as many files needed to fit at most 5 ligands per file.\n"
  "  The files will be named split_ligands-0.sdf, split_ligands-1.sdf, ... , split_ligands-N.sdf.\n\n"
			  
  "$ LigandFileSplitter -i ligands.sdf -outname_pattern split_ligands_%d.sdf -no 100\n"
	"  will split the input file ligands.sdf in 100 files using the following names:\n"
	"  split_ligands_0.sdf, split_ligands_1.sdf, ... , split_ligands_99.sdf.\n\n"
			
  "NOTES:\n"
	"- Molecules are not sorted in any way.\n"
	"- The tool is no format converter and the format of the output files will be the same as of the input file.\n"
	"- Output_name_pattern accepts a printf-like pattern, expecting exactly one decimal integer placeholder, %d.\n"
	"- The following are valid patterns: output_ligand.sdf_%d, split_%d.mol, %d_lig.drf\n"
	"- The following are invalid patterns: output_%f.sdf, ligands.drf_%u, %d_lig_%d.mol, molecules.sdf\n\n"
  
  "WARNING:\n"
	"- If the parameter outname_pattern is specified, the user is responsible for the occurrence of a valid file extension\n"
	"  in the outname_pattern, which has to be of the same file format as the input file.\n\n";
  
	parpars.setToolManual(man);
	parpars.setSupportedFormats("i","mol2,sdf,drf");
	parpars.setSupportedFormats("o","mol2,sdf,drf");
 	parpars.parse(argc, argv);
  
  
  // Check if parameter setting is valid and/or useful  
 	validateParameters(parpars);

  
  unsigned int n_molecules = 0;
  unsigned int n_outfiles = 0;
  unsigned int mpf = 0;
  
  String infile = parpars.get("i");
  String infile_name = infile.substr(0, infile.find_last_of('.'));
  String infile_type = infile.substr(infile.find_last_of('.') + 1, infile.length() - infile.find_last_of('.') - 1);
  
  vector<String> outfile_names;
  HashSet <String> conformation_ids;
  
  Molecule* mol;
 	GenericMolFile* input;
  GenericMolFile* output;

  DockResultFile* drf_input;
  DockResultFile* drf_output;  
  

	// Determine number of molecules in input files.
	// In case of DockResultFiles, we do not need to process all contained molecules 
	// in order to achieve this; we can simply count the result-section entries.
  
  input = MolFileFactory::open(infile);
	
  drf_input = NULL;
  drf_input = dynamic_cast<DockResultFile*>(input);
	if (drf_input)
	{
		n_molecules = drf_input->countConformations();
	}
	else
	{
 		Log.level(10) << "\rCount number of molecules in input file ..." << endl;
		for (mol = input->read(); mol; mol = input->read())
		{
			++n_molecules;
			delete mol;
		}
	}	
	
	Log.level(10) << "\r" << n_molecules << " molecules found." << endl << endl;

	input->close();
	delete input;

	
	// Check which split method should be applied
	
	if (parpars.has("o"))
	{
		// Option 1:
		// Number of output files specified by explicit name passing.
		// Parameter 'o' is specified
    
		list<String> tmp = parpars.getList("o");
    for (list<String>::iterator iter = tmp.begin(); iter != tmp.end(); ++iter)
    {
      outfile_names.push_back(*iter + "." + infile_type);
    }
    
		n_outfiles = outfile_names.size();
  
    if (n_molecules >= n_outfiles)
    {
      mpf = floor((double)n_molecules / n_outfiles);
    }
    else
    {
      Log.level(10) << "\rNOTE: Number of molecules in input file is smaller than number of specified output files." << endl;
      n_outfiles = n_molecules;
      mpf = 1;
    }
	}
  else
  {
    if (parpars.has("no"))
    {
      // Option 2:
		  // Number of output files is specified directly.
		  // Parameter 'no' is specified
      
      n_outfiles = parpars.get("no").toInt();
      
      if (n_molecules >= n_outfiles)
      {
        mpf = floor((double)n_molecules / n_outfiles);
      }
      else
      {
        Log.level(10) << "\rNOTE: Number of molecules in input file is smaller than specified number of output files." << endl;
        n_outfiles = n_molecules;
        mpf = 1;
      }
    }
    else
    {
      if (parpars.has("mpf"))
      {
        // Option 3:
        // Number of molecules per output file is specified directly.
        // Parameter 'mpf' is specified
      
        mpf = parpars.get("mpf").toInt();
        n_outfiles = ceil((double)n_molecules / (double)mpf);
      }
    }
    
    
    // Generate output file names
    
    if (parpars.has("outname_pattern"))
    {
      // Option 1: Generate output file names from specified pattern

      String pattern = parpars.get("outname_pattern");
      
      for (unsigned int i=0; i!= n_outfiles; ++i)
      {
        outfile_names.push_back(getOutputFileName(pattern, true, infile_type, i));
      }
    }
    else
    {
      // Option 2: Simple indexing of input file name
      
      for (unsigned int i=0; i!= n_outfiles; ++i)
      {
        outfile_names.push_back(getOutputFileName(infile_name, false, infile_type, i));
      }
    }
      
  }
  
  
  // Now do the splitting
  
  input = MolFileFactory::open(infile);

  drf_input = NULL;
	drf_input = dynamic_cast<DockResultFile*>(input);
	if (drf_input) 
  {
    drf_input->selectAllResultsForInput();
  }
  
  for (unsigned int i=0; i!=outfile_names.size(); ++i)
  {
    conformation_ids.clear();
    
    output = MolFileFactory::open(outfile_names[i], ios::out, infile_type);
    drf_output = dynamic_cast<DockResultFile*>(output);
    if (drf_input && drf_output)
    {
      drf_output->disableAutomaticResultCreation();
    }

    
    if (i < outfile_names.size()-1)
    {
      // Not the last file - so number of molecules is standard
      
      for (unsigned int j=0; j!=mpf; ++j)
      {
        mol = input->read();
        if (drf_input && drf_output && mol->hasProperty("Conformation_input_UID"))
        {
          conformation_ids.insert(mol->getProperty("Conformation_input_UID").toString());
        }

        output->write(*mol);
        delete mol;
      }
    }
    else
    {
      // Last output file - so write remaining molecules into it
      
      mol = input->read();
      while (mol)
      {
        if (drf_input && drf_output && mol->hasProperty("Conformation_input_UID"))
        {
          conformation_ids.insert(mol->getProperty("Conformation_input_UID").toString());
        }

        output->write(*mol);
        delete mol;
        mol = input->read();
      }
    }

    if (drf_input && drf_output)
    {
      const vector<Result*>* results = drf_input->getResults();
      for (unsigned int i = 0; i < results->size(); ++i)
      {
        Result* new_res = new Result(*(*results)[i], conformation_ids);
        drf_output->addResult(new_res);
      }
    }
    
    output->close();
    delete output;
  }  

  input->close();
  delete input;
  
  
  return 0;
}