File: variant_file_filters.cpp

package info (click to toggle)
vcftools 0.1.12%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,680 kB
  • ctags: 1,215
  • sloc: cpp: 12,118; perl: 10,973; ansic: 1,467; pascal: 1,064; makefile: 67; php: 57; sh: 12
file content (143 lines) | stat: -rw-r--r-- 3,981 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/*
 * variant_file_filters.cpp
 *
 *      Author: amarcketta
 */

#include "variant_file.h"

void variant_file::apply_filters(const parameters &params)
{
	filter_individuals(params.indv_to_keep, params.indv_to_exclude, params.indv_keep_file, params.indv_exclude_file);
	filter_individuals_randomly(params.max_N_indv);
}

void variant_file::filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude)
{
	// Filter individuals by user provided lists
	if (keep_then_exclude)
	{
		filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
		filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
	}
	else
	{
		filter_individuals_by_exclude_list(indv_to_exclude, indv_to_exclude_filename);
		filter_individuals_by_keep_list(indv_to_keep, indv_to_keep_filename);
	}
}

void variant_file::filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename)
{
	// Filter individuals by user provided list
	if ((indv_to_keep_filename == "") && (indv_to_keep.size() == 0))
		return;

	LOG.printLOG("Keeping individuals in 'keep' list\n");
	set<string> indv_to_keep_copy = indv_to_keep;
	if (indv_to_keep_filename != "")
	{
		ifstream infile(indv_to_keep_filename.c_str());
		if (!infile.is_open())
			LOG.error("Could not open Individual file:" + indv_to_keep_filename, 1);
		string line;
		string tmp_indv;
		stringstream ss;
		while (!infile.eof())
		{
			getline(infile, line);
			ss.str(line);
			ss >> tmp_indv;
			indv_to_keep_copy.insert(tmp_indv);
			ss.clear();
		}
		infile.close();
	}
	for (unsigned int ui=0; ui<include_indv.size(); ui++)
	{
		if (include_indv[ui] == false)
			continue;
		if (indv_to_keep_copy.find(meta_data.indv[ui]) == indv_to_keep_copy.end())
			include_indv[ui] = false;
	}
}

void variant_file::filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename)
{
	// Filter individuals by user provided list
	if ((indv_to_exclude_filename == "") && (indv_to_exclude.size() == 0))
		return;
	LOG.printLOG("Excluding individuals in 'exclude' list\n");
	set<string> indv_to_exclude_copy = indv_to_exclude;
	if (indv_to_exclude_filename != "")
	{
		ifstream infile(indv_to_exclude_filename.c_str());
		if (!infile.is_open())
		{
			LOG.error("Could not open Individual file:" + indv_to_exclude_filename, 1);
		}
		string line;
		string tmp_indv;
		stringstream ss;
		while (!infile.eof())
		{
			getline(infile, line);
			ss.str(line);
			ss >> tmp_indv;
			indv_to_exclude_copy.insert(tmp_indv);
			ss.clear();
		}
		infile.close();
	}
	for (unsigned int ui=0; ui<include_indv.size(); ui++)
	{
		if (include_indv[ui] == false)
			continue;
		if (indv_to_exclude_copy.find(meta_data.indv[ui]) != indv_to_exclude_copy.end())
			include_indv[ui] = false;
	}
}

void variant_file::filter_individuals_randomly(int max_N_indv)
{
	// Filter individuals randomly until have a random subset
	if (max_N_indv < 0)
		return;
	LOG.printLOG("Filtering Individuals Randomly\n");

	if (meta_data.has_genotypes == false)
		LOG.error("Require Genotypes in variant file filter individuals.");

	unsigned int N_kept_indv = N_kept_individuals();

	srand ( time(NULL) );
	vector<unsigned int> keep_index(N_kept_indv);
	int count = 0;
	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
	{
		if (include_indv[ui] == true)
		{
			keep_index[count] = ui;
			count++;
		}
	}

	random_shuffle(keep_index.begin(), keep_index.end());			// Get a random order
	keep_index.resize(min(max_N_indv, (signed)keep_index.size()));	// Only keep a subset

	for (unsigned int ui=0; ui<meta_data.N_indv; ui++)
	{
		if (include_indv[ui] == false)
			continue;
		bool found = false;
		for (unsigned int uj=0; uj<keep_index.size(); uj++)
		{
			if (keep_index[uj] == ui)
			{
				found = true;
			}
		}
		if (found == false)
			include_indv[ui] = false;
	}
}