File: kmerdist.cpp

package info (click to toggle)
mothur 1.48.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 13,692 kB
  • sloc: cpp: 161,866; makefile: 122; sh: 31
file content (98 lines) | stat: -rw-r--r-- 3,239 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//
//  kmerdist.cpp
//  Mothur
//
//  Created by Sarah Westcott on 3/29/21.
//  Copyright © 2021 Schloss Lab. All rights reserved.
//

#include "kmerdist.hpp"

#include <stdexcept>

#include "kmer.hpp"

/***********************************************************************/
/**
 * Builds a k-mer distance calculator for k-mers of length k.
 *
 * Stores k in kmerSize and 4^k in maxKmer (looked up from a table of powers of 4).
 *
 * @param k  k-mer length; must be an index into the power table, i.e. 0..13.
 *
 * An out-of-range k used to read past the end of the table (undefined behaviour).
 * It is now reported through MothurOut::errorOut and the program exits, which is
 * the same path every other failure in this class takes.
 */
KmerDist::KmerDist(int k) {
    try {
        m = MothurOut::getInstance();

        // power4s[i] == 4^i. 4^13 is the largest entry in the table.
        static const int power4s[] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
        static const int numPowers = static_cast<int>(sizeof(power4s) / sizeof(power4s[0]));

        // Reject sizes that would index outside the table instead of reading garbage.
        if ((k < 0) || (k >= numPowers)) {
            throw std::out_of_range("kmer size must be between 0 and 13");
        }

        kmerSize = k;
        maxKmer = power4s[kmerSize];
    }
    catch(exception& e) {
        m->errorOut(e, "KmerDist", "KmerDist");
        exit(1);
    }
}
/***********************************************************************/

/**
 * Computes a k-mer based distance between two sequences.
 *
 * Works on the unaligned strings of A and B. Counts the distinct k-mer numbers
 * found in B that also occur in A, divides that count by the number of k-mer
 * positions in the shorter sequence, and returns |log(0.1 + fraction)|.
 *
 * @param A  first sequence
 * @param B  second sequence
 * @return   |log(0.1 + matchingKmers / numKmers)|; when the shorter sequence has
 *           no complete k-mer the fraction is taken as 0, giving |log(0.1)|.
 *
 * Any exception is reported through MothurOut::errorOut and the program exits.
 */
double KmerDist::calcDist(Sequence A, Sequence B){
    try {
        string seqA = A.getUnaligned();
        string seqB = B.getUnaligned();

        // Convert the lengths to int BEFORE subtracting: length() is unsigned, so
        // "length() - kmerSize + 1" wraps around for sequences shorter than kmerSize.
        const int lengthA = static_cast<int>(seqA.length());
        const int lengthB = static_cast<int>(seqB.length());

        const int numSeqAKmers = lengthA - kmerSize + 1;
        const int numSeqBKmers = lengthB - kmerSize + 1;
        const int numKmers = min(lengthA, lengthB) - kmerSize + 1;

        // If the shorter sequence holds no complete k-mer, nothing can match.
        // Returning here also avoids the 0/0 (NaN) that numKmers == 0 produced.
        if (numKmers <= 0) {
            return fabs(log(0.1));
        }

        Kmer kmer(kmerSize);

        // One state per k-mer number:
        //   0 = not seen in A, 1 = seen in A, 2 = seen in A and already counted for B.
        vector<int> kmerState(maxKmer+1, 0);

        for (int j = 0; j < numSeqAKmers; j++) {    // step through A and mark each k-mer
            int kmerNumber = kmer.getKmerNumber(seqA, j);
            kmerState[kmerNumber] = 1;
        }

        int numMatchingKmers = 0;
        for (int j = 0; j < numSeqBKmers; j++) {    // step through B and count each shared k-mer once
            int kmerNumber = kmer.getKmerNumber(seqB, j);
            if (kmerState[kmerNumber] == 1) {
                numMatchingKmers++;
                kmerState[kmerNumber] = 2;          // do not count repeats of this k-mer in B
            }
        }

        // The division is kept in float so the results match the previous implementation.
        // fabs is used so the absolute value is always taken in floating point.
        double dist = fabs(log(0.1 + (numMatchingKmers / static_cast<float>(numKmers))));

        return dist;
    }
    catch(exception& e) {
        m->errorOut(e, "KmerDist", "calcDist");
        exit(1);
    }
}
/***********************************************************************/

/**
 * Computes two k-mer based distances from precomputed k-mer tables.
 *
 * @param A       entries of sequence A; each entry supplies a kmerNumber and a kCount.
 * @param B       table indexed by k-mer number; a non-zero entry means sequence B
 *                contains that k-mer (presumably the entry is its count in B - confirm
 *                against the caller).
 * @param length  sequence length used to derive the number of k-mer positions,
 *                length - kmerSize + 1.
 * @return        a two-element vector:
 *                [0] log(1 - matches / numKmers), where each shared k-mer contributes
 *                    min(count in A, entry in B);
 *                [1] log(1 - uniqueMatches / numKmers), where each shared k-mer
 *                    contributes 1.
 *
 * NOTE(review): numKmers is not checked; length is assumed to be at least kmerSize.
 * A ratio of exactly 1 yields log(0), i.e. -infinity.
 *
 * Any exception is reported through MothurOut::errorOut and the program exits.
 */
vector<double> KmerDist::calcDist(vector<kmerCount> A, vector<int> B, int length){
    try {
        int numMatchingKmers = 0;
        int numUniqueMatchingKmers = 0;

        int numKmers = length - kmerSize + 1;

        // size_t index: A.size() is unsigned, so this avoids a signed/unsigned comparison.
        for (size_t i = 0; i < A.size(); i++) {

            int countInB = B[A[i].kmerNumber];

            if (countInB != 0) { // does sequence B contain this kmer
                numMatchingKmers += min(A[i].kCount, countInB);
                numUniqueMatchingKmers++;
            }
        }

        // The divisions are kept in float so the results match the previous implementation.
        double dist1 = log(1.0 - (numMatchingKmers / static_cast<float>(numKmers)));
        double dist2 = log(1.0 - (numUniqueMatchingKmers / static_cast<float>(numKmers)));

        vector<double> dists;
        dists.push_back(dist1);
        dists.push_back(dist2);
        return dists;
    }
    catch(exception& e) {
        m->errorOut(e, "KmerDist", "calcDist");
        exit(1);
    }
}
/***********************************************************************/