1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
|
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <boost/program_options.hpp>
#include "fasta.hpp"
namespace po = boost::program_options;
/**
 * Extract the leading word of a character range.
 *
 * Walks [first, last) from the front and returns every character up to, but
 * not including, the first whitespace character. An empty range, or a range
 * that begins with whitespace, yields an empty string; a range without any
 * whitespace is returned in full.
 *
 * @param first  iterator to the first character of the range
 * @param last   iterator one past the last character of the range
 * @return the leading run of non-whitespace characters as a std::string
 */
template <class InputIterator>
std::string firstWord(InputIterator first, InputIterator last)
{
    InputIterator word_end = first;
    // std::isspace is only defined for values representable as unsigned char
    // (or EOF). Converting first keeps bytes >= 0x80 from being passed as
    // negative ints on platforms where plain char is signed.
    while (word_end != last && !std::isspace(static_cast<unsigned char>(*word_end)))
        ++word_end;
    return std::string(first, word_end);
}
int main(int argc, char *argv[])
{
bool produce_help;
bool cluster_centers;
bool all_clusters;
bool everything_except;
bool cluster_sequences;
std::string fasta_file_name;
std::string clusters_file_name_prefix;
const std::string FASTA_SUFFIX = "fasta";
po::options_description desc("Options");
desc.add_options()
("help,h", po::bool_switch(&produce_help), "Produce help message.")
("fasta-file,f", po::value<std::string>(&fasta_file_name), "REQUIRED - File containing sequences in FASTA format.")
("cluster-centers,c", po::bool_switch(&cluster_centers), "Write the sequences of all cluster centers to standard output in FASTA format.")
("all-clusters,a", po::bool_switch(&all_clusters), "Write the sequences of each cluster to a seperate FASTA file. The name of the files will be the given path and prefix, folloed by cluster number andy a \'.fasta\' suffix.")
("file-name-prefix,p", po::value<std::string>(&clusters_file_name_prefix)->default_value("./cluster-"), "Specify the path and prefix for the cluster FASTA file names.")
("everything-except", po::bool_switch(&everything_except), "Write all sequences except cluster centers to the standard output in FASTA format.")
("cluster-sequences", po::bool_switch(&cluster_sequences), "Write the cluster sequences (but not the cluster center sequence) to the standard output in FASTA format.")
;
po::variables_map vm;
po::store(po::command_line_parser(argc, argv).options(desc).run(), vm);
notify(vm);
if (produce_help || fasta_file_name.empty()) {
std::cerr << "Usage: fastaselect FastaFile\n"
<< " The ids are read from standard input. Each line corresponds to a cluster with the first id being the cluster center.\n"
<< " The output sequences are written to standard output in FASTA format.\n"
<< desc << '\n';
exit(EXIT_FAILURE);
}
std::ifstream fasta_file(fasta_file_name.c_str());
sequence::Fasta sequences;
fasta_file >> sequences;
typedef std::map<std::string, size_t> StringIndexMap;
StringIndexMap index_of_id;
for (size_t i = 0; i < sequences.size(); ++i) {
const std::string &header = sequences[i].header;
index_of_id[firstWord(header.begin(), header.end())] = i;
}
if (cluster_centers) {
std::string line;
while (getline(std::cin, line)) {
std::string center_id = firstWord(line.begin(), line.end());
std::cout << sequences[index_of_id[center_id]];
}
} else if (everything_except) {
std::set<std::string> center_ids;
{
std::string line;
while (getline(std::cin, line))
center_ids.insert(firstWord(line.begin(), line.end()));
}
for (size_t i = 0; i < sequences.size(); ++i) {
const std::string &header = sequences[i].header;
std::string id = firstWord(header.begin(), header.end());
if (center_ids.find(id) == center_ids.end())
std::cout << sequences[i];
}
} else if (cluster_sequences) {
std::string line;
while (getline(std::cin, line)) {
std::istringstream line_stream(line);
std::string id;
line_stream >> id;
while (line_stream >> id)
std::cout << sequences[index_of_id[id]];
}
}
if (all_clusters) {
std::string line;
int cluster_number = 0;
while (getline(std::cin, line)) {
std::ostringstream cluster_file_name_stream;
cluster_file_name_stream << clusters_file_name_prefix << ++cluster_number << '.' << FASTA_SUFFIX;
std::ofstream cluster_file(cluster_file_name_stream.str().c_str());
std::istringstream line_stream(line);
std::string id;
while (line_stream >> id)
cluster_file << sequences[index_of_id[id]];
}
}
return EXIT_SUCCESS;
}
|