File: KmerHashMap.hpp

package info (click to toggle)
uc-echo 1.12-7
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 1,772 kB
  • ctags: 372
  • sloc: cpp: 1,405; python: 660; makefile: 48; sh: 1
file content (317 lines) | stat: -rw-r--r-- 11,621 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#ifndef __KMERHASHMAP_H__
#define __KMERHASHMAP_H__

#include <tr1/tuple>
#include <set>
#include <iostream>
#include <vector>
#include <sstream>
#include <memory>
#include <map>

#include <cstring>
#include <cstdio>
#include <cassert>

#include "DNASeq.hpp"
#include "MMAP.hpp"

struct KmerOccurrence {
    unsigned int readID, pos;

    KmerOccurrence(unsigned int readID, unsigned pos) : readID(readID),pos(pos) {}
    KmerOccurrence() {} 
};

class KmerHashMap {
    static const unsigned int MaxBinSize;

    // Maps Kmers to (read_id, kmer_hit_pos) using Kmer_less comparator.
    //
    // *** Change hashing structure here ***
    // If you use a hashing structure that does not store the objects
    // in sorted order, then you will need to sort the kmers later
    // as noted below.
    std::map<Kmer, std::vector<KmerOccurrence>, Kmer_less> Data;

    std::vector<KmerOccurrence> dummy;
    unsigned int ihash_st, ihash_ed, nhash;

    public:
    KmerHashMap() : ihash_st(0),ihash_ed(1),nhash(1) {};
    KmerHashMap(unsigned int ihash_st, unsigned int ihash_ed, unsigned int nhash) : ihash_st(ihash_st), ihash_ed(ihash_ed), nhash(nhash) {};

    // Retrieve all reads that have the given kmer.
    std::vector<KmerOccurrence>& operator[](const Kmer& kmer) {
        if(dummy.size()>=MaxBinSize) dummy.clear();
        // Only accept if KmerHash falls within the right bucket
        // (which is determined below)
        // and the size of the bin is correct.
        // Otherwise, return a dummy vector.

        // 'batch' is which batch of blocks the kmer belongs to.
        // The hash table is divided into blocks.
        // The blocks are grouped into batches and processed together. 
        unsigned int batch = Kmer_hash()(kmer) % nhash;

        if(batch<ihash_st || batch>=ihash_ed) return dummy;
        std::vector<KmerOccurrence>& occ = Data[kmer];
        if(occ.size()>=MaxBinSize) return dummy;
        return occ;
    };

    void dump_binary(char* prefix, char* suffix) {
        // Number of blocks in a batch.
        assert(ihash_ed > ihash_st);
        const unsigned int nbatch = ihash_ed - ihash_st;
        FILE* fout[nbatch];
        FILE* findexout[nbatch];
        for(unsigned int i=0; i<nbatch; i++) {
            std::ostringstream fname;
            std::ostringstream findexname;
            // prefix is tmpDIR.
            // suffix is the starting read number.
            fname << prefix << suffix << ".hash";
            findexname << prefix << suffix << ".index";
            fout[i] = fopen(fname.str().c_str(), "wb");
            findexout[i] = fopen(findexname.str().c_str(), "wb");
        }

        // compute index
        std::vector<std::vector<std::tr1::tuple<Kmer, unsigned int> > > index(nbatch);
        for(unsigned int i=0; i<nbatch; i++)
            index[i].push_back(std::tr1::make_tuple(Kmer("Z"), 0));

        for(std::map<Kmer, std::vector<KmerOccurrence>, Kmer_less>::iterator iter=Data.begin(); iter!=Data.end(); iter++) {
            // ibatch is which block the kmer is in (misnomer, should be iblock instead)
            unsigned int ibatch = Kmer_hash()(iter->first) % nhash;
            assert(ibatch>=ihash_st && ibatch<ihash_ed);
            // fnum is the offset of the block within its batch.
            unsigned int fnum = ibatch - ihash_st;
            // (kmer, # reads that have kmer)
            index[fnum].push_back(std::tr1::make_tuple(iter->first, iter->second.size()));
        }

        // Remove the (Kmer("Z"),0) tuple.
        for(unsigned int i=0; i<nbatch; i++)
            index[i].erase(index[i].begin());

        // ***
        // NOTE: Might need to sort the kmers here. 
        // This is only necessary if we don't use a std::map
        // because iterating through std::map iterates in sorted order
        // ***

        // Write out index.
        // Format: kmer string, number of reads with kmer
        // nbatch is the number of blocks within a single batch.
        for(unsigned int i=0; i<nbatch; i++) {
            // Writes out:
            // num kmers in block, (kmer, num reads with kmer)+
            unsigned int nKmer = index[i].size();
            fwrite(&nKmer, sizeof(unsigned int), 1, findexout[i]); // number of kmers
            for(unsigned int kid=0; kid<nKmer; kid++) { // index of kmer
                const char* kmer = std::tr1::get<0>(index[i][kid]).getKmer().c_str();
                unsigned int nread = std::tr1::get<1>(index[i][kid]);
                fwrite(kmer, sizeof(char), strlen(kmer)+1, findexout[i]);
                fwrite(&nread, sizeof(unsigned int), 1, findexout[i]);
            }
        }

        // Write out reads.
        // Format: read ID, hit position
        for(unsigned int i=0; i<nbatch; i++) {
            // Writes out:
            // read id, pos within read
            unsigned int nKmer = index[i].size();
            for(unsigned int kid=0; kid<nKmer; kid++) {
                Kmer kmer = std::tr1::get<0>(index[i][kid]);
                std::vector<KmerOccurrence>& occur_list = Data[kmer];
                for(std::vector<KmerOccurrence>::iterator occ = occur_list.begin(); occ!=occur_list.end(); occ++) {
                    const unsigned int& readid = occ->readID;
                    const unsigned int& pos = occ->pos;
                    fwrite(&readid, sizeof(unsigned int), 1, fout[i]);
                    fwrite(&pos, sizeof(unsigned int), 1, fout[i]);		    
                }
            }
        }

        for(unsigned int i=0; i<nbatch; i++) {
            fclose(fout[i]);
            fclose(findexout[i]);
        }
    }
}; // end KmerHashMap

class HashMMAP {
    MMAP hash_mmap;
    MMAP index_mmap;

    unsigned int nKmer;

    unsigned long long fpos; // index pointer
    unsigned long long fpos2; // hash pointer

    unsigned long long fbegin; // start of hash block
    unsigned long long fend; // end of hash block

    unsigned long long f2begin; // start of hash block
    unsigned long long f2end; // end of hash block

    public:
    class ConstReadIterator {
        MMAP const &hash_mmap;
        unsigned long long fpos2;

        public:
        ConstReadIterator(MMAP const& hash_mmap, unsigned long long fpos2) : hash_mmap(hash_mmap), fpos2(fpos2) {}

        inline unsigned int getReadID() const {
            return *(unsigned int*)hash_mmap[fpos2];
            if(hash_mmap[fpos2] == NULL) {
                std::cout << "ConstReadIterator:: getReadID() out of bounds." << std::endl;
            }
        }

        inline unsigned int getPos() const {
            if(hash_mmap[fpos2+sizeof(int)] == NULL) {
                std::cout << "ConstReadIterator: getPos() out of bounds." << std::endl;
            }
            return *(unsigned int*)hash_mmap[fpos2+sizeof(unsigned int)];
        }

        inline ConstReadIterator& operator++() {
            fpos2 += 2 * sizeof(unsigned int);
            return *this;
        }

        inline ConstReadIterator& operator++(int) {
            operator++();
            return *this;
        }

        inline bool operator==(ConstReadIterator const &a) const {
            return fpos2 == a.fpos2;
        }

        inline bool operator!=(ConstReadIterator const &a) const {
            return !(fpos2 == a.fpos2);
        }
    };

    class ConstIterator {
        MMAP const &hash_mmap;
        MMAP const &index_mmap; 

        unsigned long long fpos;
        unsigned long long fpos2;

        public:
        ConstIterator(MMAP const& hash_mmap, MMAP const& index_mmap, unsigned long long fpos, unsigned long long fpos2) : hash_mmap(hash_mmap), index_mmap(index_mmap), fpos(fpos), fpos2(fpos2) {}

        inline Kmer getKmer() const {
            if(index_mmap[fpos] == NULL) {
                std::cout << "getKmer() out of bounds." << std::endl;
            }
            return Kmer((char*)index_mmap[fpos]);
        }

        inline unsigned int getOccur() const {
            if(index_mmap[fpos] == NULL) {
                std::cout << "getOccur() out of bounds." << std::endl;
            }
            return *(unsigned int*)index_mmap[strlen((char*)index_mmap[fpos]) + 1 + fpos];
        }

        inline ConstReadIterator read_begin() const {
            return ConstReadIterator(hash_mmap, fpos2);
        }

        inline ConstReadIterator read_end() const {
            return ConstReadIterator(hash_mmap, fpos2 + 2*getOccur() * sizeof(unsigned int));
        }

        inline ConstIterator& operator++() {
            int offset = strlen((char*)index_mmap[fpos]) + 1;
            if(index_mmap[offset + fpos] == NULL) {
                std::cout << "operator++() out of bounds." << std::endl;
            }
            fpos2 +=  2 * (*(unsigned int*)index_mmap[offset + fpos]) * sizeof(unsigned int);
            fpos += offset + sizeof(unsigned int);
            return *this;
        }

        inline ConstIterator& operator++(int) {
            operator++();
            return *this;
        }

        // Equality depends only on where the index pointer (fpos) points to.
        inline bool operator==(ConstIterator const& a) const {
            return fpos == a.fpos;
        }

        inline bool operator!=(ConstIterator const& a) const {
            return !(fpos == a.fpos);
        }
    };

    HashMMAP(char* fname, char* findex) : hash_mmap(fname), index_mmap(findex), fpos(0), fpos2(0) {
        nKmer = *(unsigned int*)index_mmap[0];
        fpos = sizeof(unsigned int); 

        fbegin = sizeof(unsigned int);
        fend = index_mmap.size();
        f2begin = 0;
        f2end = hash_mmap.size(); 
    }

    HashMMAP(char* fname, char* findex, unsigned int ihash_st, unsigned int ihash_ed) : hash_mmap(fname), index_mmap(findex), fpos(0), fpos2(0), fbegin(0), fend(0), f2begin(0), f2end(0) {
        nKmer = *(unsigned int*)index_mmap[0];
        // I assume all kmers are the same length so just get the
        // length of first kmer.
        unsigned int len_kmer = strlen((char*)index_mmap[sizeof(unsigned int)])+1;
        fpos = fbegin = (len_kmer+sizeof(unsigned int)) * ihash_st + sizeof(unsigned int);
        fend = (len_kmer+sizeof(unsigned int)) * ihash_ed + sizeof(unsigned int);
        assert(fend <= index_mmap.size());

        // Set fpos2 to the correct position.
        // Start from beginning of the index.
        unsigned long long tmpfpos = sizeof(unsigned int);
        fpos2 = 0;
        while(tmpfpos < fbegin) {
            tmpfpos += len_kmer; // Skip the kmer string.
            // Read the number of occurrences.
            unsigned int noccur = *(unsigned int*)index_mmap[tmpfpos]; 
            fpos2 += noccur * 2 * sizeof(unsigned int);
            tmpfpos += sizeof(unsigned int);
        }
        assert(tmpfpos == fbegin);

        // Set f2begin
        f2begin = fpos2;

        // Find end position.
        // *Continue* from tmpfpos, which was set above.
        // (Do not change ordering of these while loops.)
        f2end = fpos2;
        while(tmpfpos < fend) {
            tmpfpos += len_kmer; // Skip the kmer string.
            // Read the number of occurrences.
            unsigned int noccur = *(unsigned int*)index_mmap[tmpfpos];
            f2end += noccur * 2 * sizeof(unsigned int);
            tmpfpos += sizeof(unsigned int);
        }
        assert(f2end <= hash_mmap.size());
    }

    inline ConstIterator begin() const {
        return ConstIterator(hash_mmap, index_mmap, fbegin, f2begin);
    }

    inline ConstIterator end() const {
        return ConstIterator(hash_mmap, index_mmap, fend, f2end);
    }
};

#endif