File: HashMerge.cpp

package info (click to toggle)
uc-echo 1.12-19
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,904 kB
  • sloc: cpp: 1,405; python: 659; makefile: 44; sh: 8
file content (135 lines) | stat: -rw-r--r-- 5,119 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#include <iostream>
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <set>
#include <utility>
#include <algorithm>
#include <memory>

#include "util.hpp"
#include "DNASeq.hpp"
#include "MMAPReads.hpp"
#include "KmerHashMap.hpp"

using namespace std;

int main(int argc, char** argv) {
    // Initialize input arguments.
    Options opt(argc, argv);

    assert(opt.inputFNames.size()%2 == 0);

    // Open hash files and index files
    std::vector<tr1::shared_ptr<MMAP> > hash_files;
    std::vector<tr1::shared_ptr<MMAP> > index_files;
    for(vector<char*>::iterator iter = opt.inputFNames.begin(); iter != opt.inputFNames.end(); ++iter) {
        hash_files.push_back(tr1::shared_ptr<MMAP>(new MMAP(*iter)));
        ++iter;
        index_files.push_back(tr1::shared_ptr<MMAP>(new MMAP(*iter)));
    }

    // Output hash table file.
    FILE *fout = fopen((string(opt.fpre) + opt.fsuf + ".hash").c_str(), "wb");

    // Output index file.
    FILE *findexout = fopen((string(opt.fpre) + opt.fsuf + ".index").c_str(), "wb");

    unsigned int nFiles = opt.inputFNames.size()/2;
    vector<unsigned long long> fpos(nFiles, 0); // points to index
    vector<unsigned long long> fpos2(nFiles, 0); // points to hash table

    vector<unsigned int> num_kmers_read(nFiles, 0);
    vector<unsigned int> nKmer(nFiles, 0);

    unsigned int num_tot_kmers = 0;

    for(unsigned int hash_id=0; hash_id<nFiles; ++hash_id) {
        fpos[hash_id] = 0;
        nKmer[hash_id] = *(unsigned int*)(*index_files[hash_id])[0];
        fpos[hash_id] += sizeof(unsigned int); // Advance file pointer.
        fpos2[hash_id] = 0;
    }
    
    // Leave a space for the total number of kmers at the beginning of index.
    unsigned int tmpint = 0;
    fwrite(&tmpint, sizeof(unsigned int), 1, findexout);

    // Iterate through all hash tables.
    bool all_end_reach = false;
    while(!all_end_reach) {
        // Check to see which hash files we've reached the end of.
        vector<unsigned int> good_hashes;
        for(unsigned int hash_id=0; hash_id<nFiles; ++hash_id) {
            if(num_kmers_read[hash_id] < nKmer[hash_id]) {
                good_hashes.push_back(hash_id);
            }
        }
        if(good_hashes.size() == 0) {
            all_end_reach = true;
            continue;
        }

        // Find "minimum" kmer among the hash files.
        // Initialize min_kmer to kmer in first good hash file.
        Kmer min_kmer((char*)(*index_files[*good_hashes.begin()])[fpos[*good_hashes.begin()]]);

        for(vector<unsigned int>::iterator iter=good_hashes.begin(); iter!=good_hashes.end(); ++iter) {
            Kmer kmer((char*)(*index_files[*iter])[fpos[*iter]]);
            if(Kmer_less()(kmer, min_kmer))
                min_kmer = string((char*)(*index_files[*iter])[fpos[*iter]]);
        }

        // Update total number of kmers.
        ++num_tot_kmers;
        
        // Find which hash tables have the minimum kmer
        // and adjust their fpos, etc. appropriately.
        vector<unsigned int> min_hashes;
        vector<unsigned int> nreads(nFiles);
        unsigned int tot_noccur = 0;
        for(vector<unsigned int>::iterator iter=good_hashes.begin(); iter != good_hashes.end(); ++iter) {
            Kmer kmer((char*)(*index_files[*iter])[fpos[*iter]]);
            if(Kmer_cmp()(kmer,min_kmer)) {
                min_hashes.push_back(*iter);
                Kmer kmer((char*)(*index_files[*iter])[fpos[*iter]]);
                fpos[*iter] += min_kmer.getKmer().size()+1; // Advance file ptr.

                // Get number of reads for the kmer.
                nreads[*iter]=*(unsigned int*)(*index_files[*iter])[fpos[*iter]];
                tot_noccur += nreads.at(*iter); 
                fpos[*iter] += sizeof(unsigned int); // Advance file ptr.
                ++num_kmers_read[*iter];
            }
        }

        const char* kmer_string = min_kmer.getKmer().c_str();
        // Write out index.
        //    Write kmer string.
        fwrite(kmer_string, sizeof(char), strlen(kmer_string)+1, findexout);
        //    Write number of reads with kmer.
        fwrite(&tot_noccur, sizeof(unsigned int), 1, findexout);

        // write out reads and positions
        for(vector<unsigned int>::iterator iter=min_hashes.begin(); iter != min_hashes.end(); ++iter) {
            for(unsigned int occ_count=0; occ_count<nreads[*iter]; ++occ_count) {
                unsigned int read_id = *(unsigned int*)(*hash_files[*iter])[fpos2[*iter]];
                fpos2[*iter] += sizeof(unsigned int);
                unsigned int pos = *(unsigned int*)(*hash_files[*iter])[fpos2[*iter]];
                fpos2[*iter] += sizeof(unsigned int);
                fwrite(&read_id, sizeof(unsigned int), 1, fout);
                fwrite(&pos, sizeof(unsigned int), 1, fout);
            }
        }
    }

    // Write out total number of kmers to beginning of file.
    fseek(findexout, 0L, SEEK_SET);
    fwrite(&num_tot_kmers, sizeof(unsigned int), 1, findexout);

    // Close files.
    fclose(findexout);
    fclose(fout);
}