/*
 * variant_file.h
 *
 *  Created on: Dec 12, 2012
 *      Author: amarcketta
 */
#ifndef VARIANT_FILE_H_
#define VARIANT_FILE_H_
#include <algorithm>
#include <bitset>
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <fstream>
#include <limits>
#include <set>
#include <sstream>
#include <map>
#include <numeric>
#include <stdint.h>
#include <stdio.h>
#include <string>
#include <sys/stat.h>
#include <vector>
#include <zlib.h>
#include "parameters.h"
#include "entry.h"
#include "gamma.h"
#include "vcf_entry.h"
#include "bcf_entry.h"
#include "header.h"
#ifdef VCFTOOLS_PCA
#include "dgeev.h"
#endif
// Log object shared across translation units; it is only declared here and
// must be defined in exactly one source file elsewhere.
extern output_log LOG;
// NOTE(review): a using-directive at header scope leaks every std name into
// each file that includes this header. It is kept because the declarations
// below use unqualified names such as string, vector, set and map.
using namespace std;
class variant_file
{
public:
string filename;
bool compressed;
istream *file_in;
ifstream file_tmp;
unsigned int gzMAX_LINE_LEN;
gzFile gzfile_in;
header meta_data;
vector<bool> include_indv;
unsigned int N_entries;
unsigned int N_kept_entries;
int N_kept_individuals() const;
int N_kept_sites() const;
int N_total_sites() const;
virtual void open() = 0;
virtual void open_gz() = 0;
virtual void close() = 0;
virtual bool eof() = 0;
virtual void get_entry(vector<char> &out) = 0;
virtual entry* get_entry_object() = 0;
void ByteSwap(unsigned char *b, int n) const;
static inline bool is_big_endian() { long one= 1; return !(*((char *)(&one))); };
void apply_filters(const parameters ¶ms);
void filter_individuals(const set<string> &indv_to_keep, const set<string> &indv_to_exclude, const string &indv_to_keep_filename, const string &indv_to_exclude_filename, bool keep_then_exclude=true);
void filter_individuals_by_keep_list(const set<string> &indv_to_keep, const string &indv_to_keep_filename);
void filter_individuals_by_exclude_list(const set<string> &indv_to_exclude, const string &indv_to_exclude_filename);
void filter_individuals_randomly(int max_N_indv);
void output_frequency(const parameters ¶ms, bool output_counts=false);
void output_individuals_by_mean_depth(const parameters ¶ms);
void output_site_depth(const parameters ¶ms, bool output_mean=true);
void output_genotype_depth(const parameters ¶ms);
void output_het(const parameters ¶ms);
void output_hwe(const parameters ¶ms);
void output_SNP_density(const parameters ¶ms);
void output_indv_missingness(const parameters ¶ms);
void output_site_missingness(const parameters ¶ms);
void output_haplotype_r2(const parameters ¶ms);
void output_genotype_r2(const parameters ¶ms);
void output_genotype_chisq(const parameters ¶ms, double min_pval);
void output_interchromosomal_genotype_r2(const parameters ¶ms);
void output_interchromosomal_haplotype_r2(const parameters & params);
void output_haplotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms);
void output_genotype_r2_of_SNP_list_vs_all_others(const parameters ¶ms);
void output_singletons(const parameters ¶ms);
void output_TsTv(const parameters ¶ms);
void output_TsTv_by_count(const parameters ¶ms);
void output_TsTv_by_quality(const parameters ¶ms);
void output_TsTv_summary(const parameters ¶ms);
void output_per_site_nucleotide_diversity(const parameters ¶ms);
void output_windowed_nucleotide_diversity(const parameters ¶ms);
void output_Tajima_D(const parameters ¶ms);
void output_site_quality(const parameters ¶ms);
void output_FILTER_summary(const parameters ¶ms);
void output_kept_sites(const parameters ¶ms);
void output_removed_sites(const parameters ¶ms);
void output_LROH(const parameters ¶ms);
void output_indv_relatedness_Yang(const parameters ¶ms);
void output_indv_relatedness_Manichaikul(const parameters ¶ms);
void output_PCA(const parameters ¶ms);
void output_PCA_SNP_loadings(const parameters ¶ms);
void output_indel_hist(const parameters ¶ms);
void output_as_012_matrix(const parameters ¶ms);
void output_as_plink(const parameters ¶ms);
void output_as_plink_tped(const parameters ¶ms);
void output_BEAGLE_genotype_likelihoods(const parameters ¶ms, int GL_or_PL=0);
void output_as_IMPUTE(const parameters ¶ms);
void output_as_LDhat_phased(const parameters ¶ms);
void output_as_LDhat_unphased(const parameters ¶ms);
void output_FORMAT_information(const parameters ¶ms);
void output_weir_and_cockerham_fst(const parameters ¶ms);
void output_windowed_weir_and_cockerham_fst(const parameters ¶ms);
void output_sites_in_files(const parameters ¶ms, variant_file &diff_vcf_file);
void output_sites_in_files(const parameters ¶ms, variant_file &diff_vcf_file, map<pair<string, int>, pair<string,string> > &CHROMPOS_to_filename_pair);
void output_indv_in_files(const parameters ¶ms, variant_file &diff_vcf_file);
void output_discordance_by_site(const parameters ¶ms, variant_file &diff_vcf_file);
void output_discordance_matrix(const parameters ¶ms, variant_file &diff_vcf_file);
void output_discordance_by_indv(const parameters ¶ms, variant_file &diff_vcf_file);
void output_switch_error(const parameters ¶ms, variant_file &diff_vcf_file);
void output_INFO_for_each_site(const parameters ¶ms);
void write_stats(const parameters ¶ms);
virtual void print(const parameters ¶ms) = 0;
virtual void print_bcf(const parameters ¶ms) = 0;
void calc_hap_r2(entry *e, entry *e2, double &r2, double &D, double &Dprime, int &chr_count);
void calc_geno_r2(entry *e, entry *e2, double &r2, int &chr_count);
void calc_r2_em(entry *e, entry *e2, double &r2, int &indv_count);
void calc_geno_chisq(entry *e, entry *e2, double &chisq, double &dof, double &pval, int &indv_count);
void return_indv_union(variant_file &file2, map<string, pair< int, int> > &combined_individuals, const string &indv_ID_map_file="");
void return_site_union(variant_file &file2, const parameters ¶ms, map<pair<string, int>, pair<string, string> > &out);
void get_default_contigs(vector<string> &contig_vector);
virtual ~variant_file();
};
#endif /* VARIANT_FILE_H_ */