File: KmerCounter.hpp

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (184 lines) | stat: -rw-r--r-- 5,334 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#ifndef __KMERCOUNTER__
#define __KMERCOUNTER__

#include <vector>
#include <map>
#include <string>
#include <set>


#include "sequenceUtil.hpp"


using namespace std;



// Equality predicate handed to the hash-map typedefs below: two keys are
// considered the same entry exactly when their integer k-mer values compare equal.
struct eqstr {
    bool operator()(const kmer_int_type_t& kmer_val_a,
                    const kmer_int_type_t& kmer_val_b) const
    {
        return kmer_val_a == kmer_val_b;
    }
};


// Hash functor handed to the hash-map typedefs below.  It is the identity
// function: the integer k-mer value is returned unchanged and used as the hash.
struct hashme {
    kmer_int_type_t operator()(const kmer_int_type_t& kmer_val) const
    {
        return kmer_val;
    }
};


// Select the hash-map implementation backing Kmer_counter_map.  Every variant
// maps a kmer_int_type_t key to an unsigned int value and uses the hashme /
// eqstr functors declared above.
#ifdef __GOOGLE__

// #warning "******** using GOOGLE SPARSEHASH for Kmer graph *********"

#include <google/sparse_hash_map>
using google::sparse_hash_map;

typedef sparse_hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#elif defined(__SUNPRO_CC) // Solaris Studio compiler

#include <hash_map>

typedef std::hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#else

#include <ext/hash_map>

typedef __gnu_cxx::hash_map<kmer_int_type_t, unsigned int, hashme, eqstr> Kmer_counter_map;

#endif

// Iterator aliases are taken from whichever container type was selected above,
// so they name the same types regardless of the backend.
typedef Kmer_counter_map::iterator       Kmer_counter_map_iterator;
typedef Kmer_counter_map::const_iterator Kmer_counter_map_const_iterator;

// (k-mer value, unsigned int) pair; used as the element type of the vectors
// returned by KmerCounter's candidate/sort methods.
typedef pair<kmer_int_type_t, unsigned int> Kmer_Occurence_Pair;


// Wraps a Kmer_counter_map (kmer_int_type_t -> unsigned int) together with the
// k-mer length and a strand-mode flag.  Only the default constructor is defined
// in this header; every other member is declared here and defined elsewhere, so
// the notes below describe the declared interface, not verified behaviour.
class KmerCounter {

public:

    // Default constructor.  Explicitly initializes the scalar members so a
    // default-constructed object never holds indeterminate values (previously
    // _kmer_length and _DS_MODE were left uninitialized).  The map member is
    // default-constructed.
    KmerCounter() : _kmer_length(0), _DS_MODE(false) {}

    // Construct for k-mers of the given length.
    // NOTE(review): is_ds presumably selects double-stranded mode -- confirm
    // against the implementation.
    KmerCounter(unsigned int kmer_length, bool is_ds=false);

    unsigned int get_kmer_length();
    unsigned long size();

    // Takes the sequence by non-const reference; cov defaults to 1.
    void add_sequence(string& sequence, unsigned int cov=1);

    // Overloads accepting the k-mer either as its integer value or as a string.
    bool add_kmer (kmer_int_type_t, unsigned int count);
    bool add_kmer (string kmer, unsigned int count);

    void describe_kmers();
    void dump_kmers_to_file(string& outfilename);

    //vector<Kmer_counter_map_iterator> get_kmers_sort_descending_counts();
    vector<Kmer_Occurence_Pair> get_kmers_sort_descending_counts();

    Kmer_counter_map_iterator find_kmer(kmer_int_type_t kmer_val);

    bool kmer_exists(string kmer);
    bool kmer_exists(kmer_int_type_t kmer_val);

    unsigned int get_kmer_count(string kmer);
    unsigned int get_kmer_count(kmer_int_type_t kmer_val);

    string describe_kmer(string& kmer);

    // Conversions between the string and integer forms of a k-mer.
    string get_kmer_string(kmer_int_type_t kmer_val);
    kmer_int_type_t get_kmer_intval(string kmer);


    bool prune_kmer(string kmer); // remove kmer from map
    bool prune_kmer(kmer_int_type_t kmer_val);
    bool prune_some_kmers(unsigned int min_count, float min_entropy, bool prune_error_kmers, float min_ratio_non_error);

    bool prune_branched_kmers();

    void prune_kmers_min_count(unsigned int count);
    void prune_kmers_min_entropy(float min_entropy);

    bool clear_kmer(kmer_int_type_t kmer_val);

    // methods return kmers sorted descending by count.
    //vector<string> get_reverse_kmer_candidates(string& kmer);
    vector<Kmer_Occurence_Pair> get_reverse_kmer_candidates(kmer_int_type_t seed_kmer);
    //vector<string> get_forward_kmer_candidates(string& kmer);
    vector<Kmer_Occurence_Pair> get_forward_kmer_candidates(kmer_int_type_t seed_kmer);

    // methods return kmers unsorted, in order G,A,T,C
    //vector<string> get_reverse_kmer_candidates_unsorted(string& kmer);
    vector<Kmer_Occurence_Pair> get_reverse_kmer_candidates_unsorted(kmer_int_type_t seed_kmer, bool getZeros);
    //vector<string> get_forward_kmer_candidates_unsorted(string& kmer);
    vector<Kmer_Occurence_Pair> get_forward_kmer_candidates_unsorted(kmer_int_type_t seed_kmer, bool getZeros);

    // get the simple list of the 4 possible kmer extensions.
    // The caller supplies a 4-element array; a kmer_int_type_t pointer is returned.
    // NOTE(review): presumably the returned pointer is the supplied array -- confirm.
    kmer_int_type_t* get_forward_kmer_candidates_noLookup(kmer_int_type_t seed_kmer, kmer_int_type_t forward_kmer_array_size_4 [4]);
    kmer_int_type_t* get_reverse_kmer_candidates_noLookup(kmer_int_type_t seed_kmer, kmer_int_type_t reverse_kmer_array_size_4 [4]);


    bool prune_kmer_extensions( float min_ratio_non_error);

    // Read-only access to a Kmer_counter_map (returned by const reference).
    const Kmer_counter_map& get_kmer_counter_map() const;



private:

    // Length of the k-mers handled by this counter (0 when default-constructed).
    unsigned int _kmer_length;

    // k-mer value -> unsigned int count.
    Kmer_counter_map _kmer_counter;

    // Strand-mode flag (false when default-constructed).
    bool _DS_MODE;

};



// Holds a set<kmer_int_type_t> plus a k-mer length and strand-mode flag.
// Declarations only; the member functions are defined elsewhere.
// NOTE(review): presumably tracks which k-mers have already been visited and
// honours is_ds when comparing k-mers -- confirm against the implementation.
class Kmer_visitor {

public:

    Kmer_visitor(unsigned int kmer_length, bool is_ds);

    void add(kmer_int_type_t kmer);
    bool exists(kmer_int_type_t kmer);
    void erase(kmer_int_type_t kmer);
    void clear();

    unsigned int size();

private:

    unsigned int _kmer_length;
    bool _DS_MODE;

    // Backing store of k-mer values.
    set<kmer_int_type_t> _set;

};



// Comparison functor bound to a KmerCounter, with call operators for four
// operand kinds: map iterators, Kmer_Occurence_Pair, raw k-mer values, and
// strings.  Declarations only; the ordering logic is defined elsewhere.
// NOTE(review): the name indicates a descending-by-count ordering -- confirm
// against the implementation.
class Sort_kmer_by_count_desc {

public:

    Sort_kmer_by_count_desc(KmerCounter *kcounter);

    bool operator()(const Kmer_counter_map_iterator& i, const Kmer_counter_map_iterator& j);
    bool operator()(const Kmer_Occurence_Pair& i, const Kmer_Occurence_Pair& j);
    bool operator()(const kmer_int_type_t& val_i, const kmer_int_type_t& val_j);
    bool operator()(const string& i, const string& j);

private:

    // Raw pointer to the associated counter, as passed to the constructor.
    KmerCounter *kcounter;

};



#endif