1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
#ifndef __KMERCOUNTER__
#define __KMERCOUNTER__
#include <vector>
#include <map>
#include <string>
#include <set>
#include "sequenceUtil.hpp"
using namespace std;
// Equality functor for k-mer integer values; it is passed as the key-equality
// template argument of the hash-map typedefs declared later in this header.
struct eqstr {
    // Returns true when the two k-mer values compare equal with operator==.
    bool operator()(const kmer_int_type_t& kmer_val_a, const kmer_int_type_t& kmer_val_b) const {
        const bool is_same_kmer = (kmer_val_a == kmer_val_b);
        return is_same_kmer;
    }
};
// Hash functor for k-mer integer values; it is passed as the hasher template
// argument of the hash-map typedefs declared later in this header.
struct hashme {
    // Identity hash: the k-mer value itself is returned as the hash.
    kmer_int_type_t operator() (const kmer_int_type_t& kmer_val) const {
        const kmer_int_type_t hash_value = kmer_val;
        return hash_value;
    }
};
// Pick the hash-map implementation behind Kmer_counter_map. Every variant maps
// a kmer_int_type_t key to an unsigned int, hashing with `hashme` and comparing
// keys with `eqstr`.
#ifdef __GOOGLE__

// Built with __GOOGLE__ defined: Google sparsehash.
// #warning "******** using GOOGLE SPARSEHASH for Kmer graph *********"
#include <google/sparse_hash_map>
using google::sparse_hash_map;

typedef sparse_hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#elif defined(__SUNPRO_CC)

// Solaris Studio compiler: its std::hash_map.
#include <hash_map>

typedef std::hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#else

// Everything else: the GNU extension hash_map.
#include <ext/hash_map>

typedef __gnu_cxx::hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#endif

// Iterator aliases, derived from whichever map type was selected above.
typedef Kmer_counter_map::iterator       Kmer_counter_map_iterator;
typedef Kmer_counter_map::const_iterator Kmer_counter_map_const_iterator;

// (k-mer value, unsigned int) pair; this is the element type of the
// candidate/sorted vectors that KmerCounter returns.
typedef std::pair<kmer_int_type_t, unsigned int> Kmer_Occurence_Pair;
class KmerCounter {
public:
KmerCounter() {};
KmerCounter(unsigned int kmer_length, bool is_ds=false);
unsigned int get_kmer_length();
unsigned long size();
void add_sequence(string& sequence, unsigned int cov=1);
bool add_kmer (kmer_int_type_t, unsigned int count);
bool add_kmer (string kmer, unsigned int count);
void describe_kmers();
void dump_kmers_to_file(string& outfilename);
//vector<Kmer_counter_map_iterator> get_kmers_sort_descending_counts();
vector<Kmer_Occurence_Pair> get_kmers_sort_descending_counts();
Kmer_counter_map_iterator find_kmer(kmer_int_type_t kmer_val);
bool kmer_exists(string kmer);
bool kmer_exists(kmer_int_type_t kmer_val);
unsigned int get_kmer_count(string kmer);
unsigned int get_kmer_count(kmer_int_type_t kmer_val);
string describe_kmer(string& kmer);
string get_kmer_string(kmer_int_type_t kmer_val);
kmer_int_type_t get_kmer_intval(string kmer);
bool prune_kmer(string kmer); // remove kmer from map
bool prune_kmer(kmer_int_type_t kmer_val);
bool prune_some_kmers(unsigned int min_count, float min_entropy, bool prune_error_kmers, float min_ratio_non_error);
bool prune_branched_kmers();
void prune_kmers_min_count(unsigned int count);
void prune_kmers_min_entropy(float min_entropy);
bool clear_kmer(kmer_int_type_t kmer_val);
// methods return kmers sorted descending by count.
//vector<string> get_reverse_kmer_candidates(string& kmer);
vector<Kmer_Occurence_Pair> get_reverse_kmer_candidates(kmer_int_type_t seed_kmer);
//vector<string> get_forward_kmer_candidates(string& kmer);
vector<Kmer_Occurence_Pair> get_forward_kmer_candidates(kmer_int_type_t seed_kmer);
// methods return kmers unsorted, in order G,A,T,C
//vector<string> get_reverse_kmer_candidates_unsorted(string& kmer);
vector<Kmer_Occurence_Pair> get_reverse_kmer_candidates_unsorted(kmer_int_type_t seed_kmer, bool getZeros);
//vector<string> get_forward_kmer_candidates_unsorted(string& kmer);
vector<Kmer_Occurence_Pair> get_forward_kmer_candidates_unsorted(kmer_int_type_t seed_kmer, bool getZeros);
// get the simple list of the 4 possible kmer extensions.
kmer_int_type_t* get_forward_kmer_candidates_noLookup(kmer_int_type_t seed_kmer, kmer_int_type_t forward_kmer_array_size_4 [4]);
kmer_int_type_t* get_reverse_kmer_candidates_noLookup(kmer_int_type_t seed_kmer, kmer_int_type_t reverse_kmer_array_size_4 [4]);
bool prune_kmer_extensions( float min_ratio_non_error);
const Kmer_counter_map& get_kmer_counter_map() const;
private:
unsigned int _kmer_length;
Kmer_counter_map _kmer_counter;
bool _DS_MODE;
};
class Kmer_visitor {
public:
Kmer_visitor(unsigned int kmer_length, bool is_ds);
void add (kmer_int_type_t kmer);
bool exists (kmer_int_type_t kmer);
void erase (kmer_int_type_t kmer);
void clear();
unsigned int size();
private:
unsigned int _kmer_length;
bool _DS_MODE;
set<kmer_int_type_t> _set;
};
// Comparison functor taking a KmerCounter pointer at construction. It provides
// one operator() overload per k-mer representation used in this header: map
// iterators, (k-mer, count) pairs, integer k-mer values and k-mer strings.
// NOTE(review): going by the class name, each overload orders by descending
// count -- the definitions are elsewhere, so confirm there.
class Sort_kmer_by_count_desc {

public:

    Sort_kmer_by_count_desc(KmerCounter *kcounter);

    bool operator() (const Kmer_counter_map_iterator& i, const Kmer_counter_map_iterator& j);
    bool operator() (const Kmer_Occurence_Pair& i, const Kmer_Occurence_Pair& j);
    bool operator() (const kmer_int_type_t& val_i, const kmer_int_type_t& val_j);
    bool operator() (const std::string& i, const std::string& j);

private:

    // Counter pointer held by the functor; ownership is not visible here.
    KmerCounter *kcounter;

};
#endif
|