File: fastaselect.cpp

package info (click to toggle)
dnaclust 3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 260 kB
  • ctags: 259
  • sloc: cpp: 2,098; sh: 506; makefile: 46
file content (115 lines) | stat: -rw-r--r-- 4,096 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <boost/program_options.hpp>
#include "fasta.hpp"
namespace po = boost::program_options;

/**
 * Predicate telling whether `c` is a whitespace character according to
 * std::isspace.
 *
 * The character is converted to unsigned char first: calling std::isspace
 * with a negative value other than EOF is undefined behaviour, and plain
 * `char` may be signed (e.g. bytes >= 0x80 of UTF-8 text).
 */
inline bool isWhitespaceChar(char c)
{
  return std::isspace(static_cast<unsigned char>(c)) != 0;
}

/**
 * Returns the leading run of non-whitespace characters of [first, last).
 *
 * @param first  Beginning of the character range.
 * @param last   End of the character range.
 * @return The characters from `first` up to (not including) the first
 *         whitespace character, or the whole range when it contains no
 *         whitespace. The result is empty when the range is empty or starts
 *         with whitespace.
 *
 * The range is traversed twice (once to search, once to copy), so the
 * iterators must allow multi-pass traversal.
 */
template <class InputIterator>
std::string firstWord(InputIterator first, InputIterator last)
{
  return std::string(first, std::find_if(first, last, isWhitespaceChar));
}

namespace {

typedef std::map<std::string, size_t> StringIndexMap;

/**
 * Looks up the position of the sequence called `id`.
 *
 * @param index_of_id  Map from sequence id to its position in the FASTA data.
 * @param id           Id to look up.
 * @param index        Receives the position when the id is known; left
 *                     untouched otherwise.
 * @return true when the id is known. Otherwise a warning is written to
 *         standard error and false is returned.
 *
 * A plain `index_of_id[id]` would default-insert the value 0 for an unknown
 * id and thereby silently select the first sequence of the file.
 */
bool findSequenceIndex(const StringIndexMap &index_of_id, const std::string &id, size_t &index)
{
  const StringIndexMap::const_iterator position = index_of_id.find(id);
  if (position == index_of_id.end()) {
    std::cerr << "fastaselect: warning: no sequence with id '" << id
              << "' in the FASTA file; skipped.\n";
    return false;
  }
  index = position->second;
  return true;
}

} // namespace

/**
 * fastaselect: selects sequences of a FASTA file according to cluster lines
 * read from standard input.
 *
 * Every input line describes one cluster; its first whitespace-separated word
 * is the id of the cluster center and the remaining words are the ids of the
 * other cluster members. A sequence id is the first word of its FASTA header.
 *
 * Exactly one of the following modes is run, in this order of precedence:
 *   --cluster-centers    write the center sequence of every line to stdout;
 *   --everything-except  write every sequence that is not a center to stdout;
 *   --cluster-sequences  write every non-center member to stdout.
 * --all-clusters is evaluated independently of the modes above and writes each
 * line's sequences to "<prefix><n>.fasta". When it is combined with one of the
 * other modes, standard input has already been read to end-of-file, so no
 * cluster files are produced.
 *
 * @return EXIT_SUCCESS on success; EXIT_FAILURE when the options are invalid
 *         or help was shown, when a file cannot be opened, or when at least
 *         one requested id does not exist in the FASTA file (such ids are
 *         reported on standard error and skipped).
 */
int main(int argc, char *argv[])
{
  // Initialised explicitly so the flags hold defined values even when option
  // parsing fails before notify() stores into them.
  bool produce_help = false;
  bool cluster_centers = false;
  bool all_clusters = false;
  bool everything_except = false;
  bool cluster_sequences = false;
  std::string fasta_file_name;
  std::string clusters_file_name_prefix;
  const std::string FASTA_SUFFIX = "fasta";

  po::options_description desc("Options");
  desc.add_options()
    ("help,h", po::bool_switch(&produce_help), "Produce help message.")
    ("fasta-file,f", po::value<std::string>(&fasta_file_name), "REQUIRED - File containing sequences in FASTA format.")
    ("cluster-centers,c", po::bool_switch(&cluster_centers), "Write the sequences of all cluster centers to standard output in FASTA format.")
    ("all-clusters,a", po::bool_switch(&all_clusters), "Write the sequences of each cluster to a separate FASTA file. The name of the files will be the given path and prefix, followed by cluster number and a \'.fasta\' suffix.")
    ("file-name-prefix,p", po::value<std::string>(&clusters_file_name_prefix)->default_value("./cluster-"), "Specify the path and prefix for the cluster FASTA file names.")
    ("everything-except", po::bool_switch(&everything_except), "Write all sequences except cluster centers to the standard output in FASTA format.")
    ("cluster-sequences", po::bool_switch(&cluster_sequences), "Write the cluster sequences (but not the cluster center sequence) to the standard output in FASTA format.")
    ;

  po::variables_map vm;
  try {
    po::store(po::command_line_parser(argc, argv).options(desc).run(), vm);
    po::notify(vm);
  } catch (const std::exception &error) {
    // Malformed command lines (unknown option, missing argument, ...) are
    // reported as exceptions; turn them into a diagnostic instead of letting
    // the program terminate abnormally.
    std::cerr << "fastaselect: " << error.what() << '\n'
              << desc << '\n';
    return EXIT_FAILURE;
  }

  if (produce_help || fasta_file_name.empty()) {
    std::cerr << "Usage: fastaselect FastaFile\n"
              << " The ids are read from standard input. Each line corresponds to a cluster with the first id being the cluster center.\n"
              << " The output sequences are written to standard output in FASTA format.\n"
              << desc << '\n';
    return EXIT_FAILURE;
  }

  std::ifstream fasta_file(fasta_file_name.c_str());
  if (!fasta_file) {
    std::cerr << "fastaselect: cannot open FASTA file '" << fasta_file_name << "'\n";
    return EXIT_FAILURE;
  }
  sequence::Fasta sequences;
  fasta_file >> sequences;

  // Map the first word of every header to the position of its sequence. When
  // two headers share an id, the later sequence wins.
  StringIndexMap index_of_id;
  for (size_t i = 0; i < sequences.size(); ++i) {
    const std::string &header = sequences[i].header;
    index_of_id[firstWord(header.begin(), header.end())] = i;
  }

  // Becomes false as soon as an id from standard input is not in the FASTA file.
  bool all_ids_found = true;

  if (cluster_centers) {
    std::string line;
    while (getline(std::cin, line)) {
      const std::string center_id = firstWord(line.begin(), line.end());
      if (center_id.empty())
        continue; // Blank line (or leading whitespace): no center to print.
      size_t index = 0;
      if (findSequenceIndex(index_of_id, center_id, index))
        std::cout << sequences[index];
      else
        all_ids_found = false;
    }
  } else if (everything_except) {
    // Collect all center ids first, then print every sequence not among them.
    std::set<std::string> center_ids;
    {
      std::string line;
      while (getline(std::cin, line))
        center_ids.insert(firstWord(line.begin(), line.end()));
    }
    for (size_t i = 0; i < sequences.size(); ++i) {
      const std::string &header = sequences[i].header;
      const std::string id = firstWord(header.begin(), header.end());
      if (center_ids.find(id) == center_ids.end())
        std::cout << sequences[i];
    }
  } else if (cluster_sequences) {
    std::string line;
    while (getline(std::cin, line)) {
      std::istringstream line_stream(line);
      std::string id;
      line_stream >> id; // Discard the cluster center, the first id on the line.
      while (line_stream >> id) {
        size_t index = 0;
        if (findSequenceIndex(index_of_id, id, index))
          std::cout << sequences[index];
        else
          all_ids_found = false;
      }
    }
  }

  if (all_clusters) {
    std::string line;
    int cluster_number = 0;
    while (getline(std::cin, line)) {
      // Files are numbered from 1 in input order; every input line, even an
      // empty one, produces a file.
      std::ostringstream cluster_file_name_stream;
      cluster_file_name_stream << clusters_file_name_prefix << ++cluster_number << '.' << FASTA_SUFFIX;
      const std::string cluster_file_name = cluster_file_name_stream.str();
      std::ofstream cluster_file(cluster_file_name.c_str());
      if (!cluster_file) {
        std::cerr << "fastaselect: cannot open cluster file '" << cluster_file_name << "' for writing\n";
        return EXIT_FAILURE;
      }

      std::istringstream line_stream(line);
      std::string id;
      while (line_stream >> id) {
        size_t index = 0;
        if (findSequenceIndex(index_of_id, id, index))
          cluster_file << sequences[index];
        else
          all_ids_found = false;
      }
    }
  }

  return all_ids_found ? EXIT_SUCCESS : EXIT_FAILURE;
}