File: vcf_file.h

package info (click to toggle)
vcftools 0.1.9-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 1,396 kB
  • sloc: perl: 10,233; cpp: 7,950; pascal: 751; makefile: 60; php: 43; sh: 12
file content (189 lines) | stat: -rw-r--r-- 8,821 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*
 * vcf_file.h
 *
 *  Created on: Aug 19, 2009
 *      Author: Adam Auton
 *      ($Revision: 249 $)
 */

#ifndef VCF_FILE_H_
#define VCF_FILE_H_

#include <cmath>
#include <cstdio>
#include <cstring>
#include <deque>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <limits>
#include <set>
#include <sstream>
#include <stdint.h>
#include <string>
#include <sys/stat.h>
#include <vector>
#include <zlib.h>

#include "output_log.h"
#include "parameters.h"
#include "vcf_entry.h"

#ifdef VCFTOOLS_PCA
	#include "dgeev.h"
#endif

extern output_log LOG;

using namespace std;

class vcf_file
{
public:
	vcf_file(const string &filename, bool compressed=false, const string &chr="", const string &exclude_chr="", bool force_write_index=false);
	~vcf_file();

	const string filename;
	bool compressed;
	vector<string> meta;
	vector<string> indv;
	unsigned int N_indv;
	unsigned int N_entries;

	deque<streampos> entry_file_locations;
	void get_vcf_entry(unsigned int entry_num, string &out);

	vector<bool> include_indv;
	deque<bool> include_entry;
	deque<vector<bool> > include_genotype;

	void apply_filters(const parameters &params);

	void filter_sites(const set<string> &snps_to_keep, const string &snps_to_keep_file, const string &snps_to_exclude_file, bool keep_then_exclude = false);
	void filter_sites_to_keep(const set<string> &snps_to_keep, const string &snps_to_keep_file);
	void filter_sites_to_exclude(const string &snps_to_exclude_file);
	void filter_sites_by_position(const string &chr, int start_pos, int end_pos);
	void filter_sites_by_positions(const string &positions_file, const string &exclude_positions_file);
	void filter_sites_by_quality(double min_quality);
	void filter_sites_by_mean_depth(double min_mean_depth, double max_mean_depth);
	void filter_sites_by_frequency_and_call_rate(double min_maf, double max_maf, double min_non_ref_af, double max_non_ref_af, double min_site_call_rate);
	void filter_sites_by_allele_type(bool keep_only_indels, bool remove_indels);
	void filter_sites_by_allele_count(double min_mac, double max_mac, double min_non_ref_ac, double max_non_ref_ac, double max_missing_call_count);
	void filter_sites_by_number_of_alleles(int min_alleles, int max_alleles);
	void filter_sites_by_HWE_pvalue(double min_HWE_pvalue);
	void filter_sites_by_BED_file(const string &bed_file, bool BED_exclude = false);
	void filter_sites_by_mask(const string &mask_file, bool invert_mask = false, int min_kept_mask_value=0);
	void filter_sites_by_filter_status(const set<string> &filter_flags_to_remove, const set<string> &filter_flags_to_keep, bool remove_all = false);
	void filter_sites_by_phase();
	void filter_sites_by_thinning(int min_SNP_distance);
	void filter_sites_by_INFO_flags(const set<string> &flags_to_remove, const set<string> &flags_to_keep);

	void filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude=true);
	void filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename);
	void filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename);
	void filter_individuals_by_call_rate(double min_call_rate);
	void filter_individuals_by_mean_depth(double min_mean_depth, double max_mean_depth);
	void filter_individuals_by_phase();
	void filter_individuals_randomly(int max_N_indv);

	void filter_genotypes_by_quality(double min_genotype_quality);
	void filter_genotypes_by_depth(int min_depth, int max_depth);
	void filter_genotypes_by_filter_flag(const set<string> &filter_flags_to_remove, bool remove_all = false);

	void output_frequency(const string &output_file_prefix, bool output_counts=false, bool suppress_allele_output=false);
	void output_individuals_by_mean_depth(const string &output_file_prefix);
	void output_site_depth(const string &output_file_prefix, bool output_mean=true);
	void output_genotype_depth(const string &output_file_prefix);
	void output_het(const string &output_file_prefix);
	void output_hwe(const string &output_file_prefix);
	void output_SNP_density(const string &output_file_prefix, int bin_size);
	void output_missingness(const string &output_file_prefix);
	void output_genotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
	void output_interchromosomal_genotype_r2(const string &output_file_prefix, double min_r2=0.1);
	void output_haplotype_r2(const string &output_file_prefix, int snp_window_size, int bp_window_size, double min_r2);
	void output_singletons(const string &output_file_prefix);
	void output_TsTv(const string &output_file_prefix, int bin_size);
	void output_TsTv_by_count(const string &output_file_prefix);
	void output_TsTv_by_quality(const string &output_file_prefix);
	void output_per_site_nucleotide_diversity(const string &output_file_prefix);
	void output_windowed_nucleotide_diversity(const string &output_file_prefix, int window_size);
	void output_Tajima_D(const string &output_file_prefix, int window_size);
	void output_site_quality(const string &output_file_prefix);
	void output_FILTER_summary(const string &output_file_prefix);
	void output_kept_and_removed_sites(const string &output_file_prefix);
	void output_LROH(const string &output_file_prefix);
	void output_indv_relatedness(const string &output_file_prefix);
	void output_PCA(const string &output_file_prefix, bool use_normalisation=true, int SNP_loadings_N_PCs=-1);

	void output_as_012_matrix(const string &output_file_prefix);
	void output_as_plink(const string &output_file_prefix);
	void output_as_plink_tped(const string &output_file_prefix);
	void output_BEAGLE_genotype_likelihoods(const string &output_file_prefix);
	void output_as_IMPUTE(const string &output_file_prefix);
	void output_as_LDhat_phased(const string &output_file_prefix, const string &chr);
	void output_as_LDhat_unphased(const string &output_file_prefix, const string &chr);
	void output_LDhat_locs_file(const string &output_file_prefix, const string &chr, unsigned int &n_sites_out);
	void output_FORMAT_information(const string &output_file_prefix, const string &FORMAT_id);

	void output_hapmap_fst_from_two_files(const string &output_file_prefix, vcf_file &vcf_fst);
	void output_hapmap_fst(const string &output_file_prefix, const vector<string> &indv_files);
	void output_weir_and_cockerham_fst(const string &output_file_prefix, const vector<string> &indv_files);

	void output_sites_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
	void output_indv_in_files(const string &output_file_prefix, vcf_file &diff_vcf_file);
	void output_discordance_by_site(const string &output_file_prefix, vcf_file &diff_vcf_file);
	void output_discordance_matrix(const string &output_file_prefix, vcf_file &diff_vcf_file);
	void output_discordance_by_indv(const string &output_file_prefix, vcf_file &diff_vcf_file);
	void output_switch_error(const string &output_file_prefix, vcf_file &diff_vcf_file);

	void output_INFO_for_each_site(const string &output_file_prefix, const vector<string> &INFO_to_extract);

	void print(ostream &out, const set<string> &INFO_to_keep, bool keep_all_INFO);
	void print(const string &output_file_prefix, const set<string> &INFO_to_keep, bool keep_all_INFO=false);

	int N_kept_individuals() const;
	int N_kept_sites() const;
	unsigned int N_genotypes_included(unsigned int entry_num) const;

private:
	ifstream vcf_in;
	gzFile gzvcf_in;
	char *gz_readbuffer;
	unsigned int gzMAX_LINE_LEN;
	void open();
	void close();
	bool feof();
	inline void read_line(string &out);
	inline void read_CHROM_only(string &CHROM);
	void read_CHROM_and_POS_only(string &CHROM, int &POS);
	inline void read_CHROM_and_POS_and_skip_remainder_of_line(string &CHROM, int &POS);
	void parse_header(const string &line);
	void parse_meta(const string &line);
	streampos get_filepos();
	void set_filepos(streampos &filepos);

	bool has_body;
	bool has_file_format;
	bool has_genotypes;
	bool has_header;
	bool has_meta;

	void scan_file(const string &chr="", const string &exclude_chr="", bool force_write_index=false);
	inline char peek();

	void return_indv_union(vcf_file &file2, map<string, pair< int, int> > &combined_individuals);
	void return_site_union(vcf_file &file2, map<pair<string, int>, pair<int, int> > &out);

	bool read_index_file(const string &index_filename);
	void write_index_file(const string &index_filename);

	void ByteSwap(unsigned char *b, int n) const;
	int idx_read(gzFile &in, void *buffer, unsigned int len, size_t size);
	void idx_write(gzFile &out, void *buffer, unsigned int len, size_t size);

	bool big_endian_machine;
	static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); };

};

#endif /* VCF_FILE_H_ */