1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
/*
* variant_file_filters.cpp
*
* Author: amarcketta
*/
#include "variant_file.h"
void variant_file::apply_filters(const parameters ¶ms)
{
filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file);
filter_individuals_randomly(params.max_N_indv);
}
void variant_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude)
{
// Filter individuals by user provided lists
if (keep_then_exclude)
{
filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
}
else
{
filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
}
}
void variant_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename)
{
// Filter individuals by user provided list
if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0))
return;
LOG.printLOG("Keeping individuals in 'keep' list\n");
set<string> indv_to_keep_copy = indv_to_keep;
if (indv_to_keep_filename != "")
{
ifstream infile(indv_to_keep_filename.c_str());
if (!infile.is_open())
LOG.error("Could not open Individual file:" + indv_to_keep_filename, 1);
string line;
string tmp_indv;
stringstream ss;
while (!infile.eof())
{
getline(infile, line);
ss.str(line);
ss >> tmp_indv;
indv_to_keep_copy.insert(tmp_indv);
ss.clear();
}
infile.close();
}
for (unsigned int ui=0; ui<include_indv.size(); ui++)
{
if (include_indv[ui] == false)
continue;
if (indv_to_keep_copy.find(meta_data.indv[ui]) == indv_to_keep_copy.end())
include_indv[ui] = false;
}
}
void variant_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename)
{
// Filter individuals by user provided list
if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0))
return;
LOG.printLOG("Excluding individuals in 'exclude' list\n");
set<string> indv_to_exclude_copy = indv_to_exclude;
if (indv_to_exclude_filename != "")
{
ifstream infile(indv_to_exclude_filename.c_str());
if (!infile.is_open())
{
LOG.error("Could not open Individual file:" + indv_to_exclude_filename, 1);
}
string line;
string tmp_indv;
stringstream ss;
while (!infile.eof())
{
getline(infile, line);
ss.str(line);
ss >> tmp_indv;
indv_to_exclude_copy.insert(tmp_indv);
ss.clear();
}
infile.close();
}
for (unsigned int ui=0; ui<include_indv.size(); ui++)
{
if (include_indv[ui] == false)
continue;
if (indv_to_exclude_copy.find(meta_data.indv[ui]) != indv_to_exclude_copy.end())
include_indv[ui] = false;
}
}
void variant_file::filter_individuals_randomly(int max_N_indv)
{
// Filter individuals randomly until have a random subset
if (max_N_indv < 0)
return;
LOG.printLOG("Filtering Individuals Randomly\n");
if (meta_data.has_genotypes == false)
LOG.error("Require Genotypes in variant file filter individuals.");
unsigned int N_kept_indv = N_kept_individuals();
srand ( time(NULL) );
vector<unsigned int> keep_index(N_kept_indv);
int count = 0;
for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
{
if (include_indv[ui] == true)
{
keep_index[count] = ui;
count++;
}
}
random_shuffle(keep_index.begin(), keep_index.end()); // Get a random order
keep_index.resize(min(max_N_indv, (signed)keep_index.size())); // Only keep a subset
for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
{
if (include_indv[ui] == false)
continue;
bool found = false;
for (unsigned int uj=0; uj<keep_index.size(); uj++)
{
if (keep_index[uj] == ui)
{
found = true;
}
}
if (found == false)
include_indv[ui] = false;
}
}
|