File: DNASeq.hpp

package info (click to toggle)
uc-echo 1.12-19
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,904 kB
  • sloc: cpp: 1,405; python: 659; makefile: 44; sh: 8
file content (184 lines) | stat: -rw-r--r-- 4,379 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#ifndef __DNASEQ_H__
#define __DNASEQ_H__

#include <string>
#include <stdexcept>
#include <tr1/unordered_map>
#include <tr1/memory>
#include <algorithm>

class DNASeq {
    friend class Kmer;
    friend class Kmer_cmp;
    friend class Kmer_hash; 

    static std::tr1::unordered_map<char, char> ComplementBase;

    const char* seq;
    int seq_len;

    static std::tr1::unordered_map<char, char> ComplementBaseInitializer() {
        std::tr1::unordered_map<char, char> ComplementBase;
        ComplementBase['A'] = 'T';
        ComplementBase['C'] = 'G';
        ComplementBase['G'] = 'C';
        ComplementBase['T'] = 'A';
        return ComplementBase;
    }

    public:
    DNASeq() : seq(NULL), seq_len(0) {};
    DNASeq(const char* s) : seq(NULL), seq_len(0) {
        setSeq(s);
    }
    inline const char* getSeq() const {
        return seq;
    }
    inline void setSeq(const char* s) {
        if(s!=NULL) {
            seq = s;
            seq_len = strlen(seq);
        }
    }
    inline int size() const {
        return seq_len;
    }
    inline bool operator==(DNASeq const& a) const {
        return (a.seq != NULL) && (seq != NULL ) && (strcmp(seq, a.seq)) && (seq_len == a.seq_len);
    }
};

class Kmer {
    friend class Kmer_cmp;
    friend class Kmer_less;    
    friend class Kmer_hash;

    std::string kmer;
    unsigned int hash;

    void updateHash(){
        const int hash_basegroup = sizeof(unsigned int)*8/2;
        hash=0;
        for(size_t i=0; i<kmer.size(); i++) {
            switch(kmer[i]) {
                case 'A':
                    hash+=0<<((i%hash_basegroup)*2);
                    break;
                case 'C':
                    hash+=1<<((i%hash_basegroup)*2);
                    break;
                case 'G':
                    hash+=2<<((i%hash_basegroup)*2);
                    break;
                case 'T':
                    hash+=3<<((i%hash_basegroup)*2);
                    break;
                case 'Z':
                case 'N':
                    break;
                default:
                    throw std::runtime_error("Kmer::updateHash(): Kmer contains non-base character");
            }
        }
    }

    public:
    // Create Kmer from DNASeq and pos, kmer_len
    Kmer(std::tr1::shared_ptr<const DNASeq> seq, int pos, int len) {
        kmer.resize(len);
        for(int i=0;i<len; i++)
            kmer[i] = seq->seq[pos+i];
        updateHash();
    };

    // Create Kmer directly from string
    Kmer(std::string const& seq) {
        kmer = seq;
        updateHash();
    };

    inline std::string getKmer() const {
        return kmer;
    }
};

struct Kmer_hash {
    inline unsigned int operator()(Kmer const& kmer) const {
        return kmer.hash;
    }
};

struct Kmer_cmp {
    inline bool operator()(Kmer const& kmer1, Kmer const& kmer2) const {
        if(kmer1.hash!=kmer2.hash) return false;
        return (kmer1.kmer==kmer2.kmer);
    }
};

struct Kmer_less {
    inline bool operator()(Kmer const& kmer1, Kmer const& kmer2) const {
        if(kmer1.hash<kmer2.hash) return true;
        if(kmer1.hash>kmer2.hash) return false;
        return (kmer1.kmer<kmer2.kmer);
    }    
};

inline int baseToInt(char base) {
    switch(base) {
        case 'A':
            return 0;
        case 'C':
            return 1;
        case 'G':
            return 2;
        case 'T':
            return 3;
        case 'N':
        default:
            // Anything else will be regarded as 'N'
            return 4;
    }
    return 0;
};

inline char intToBase(int i) {
    switch(i) {
        case 0:
            return 'A';
        case 1:
            return 'C';
        case 2:
            return 'G';
        case 3:
            return 'T';
        default:
            throw std::runtime_error("intToBase: argument must be in [0,3]");
    }
    return 0;
};

inline int intToBase(int i1, int i2) {
    if(i1==i2) return intToBase(i1);
    if(i1>i2) std::swap(i1, i2);
    if(i1==0) {
        if(i2==1) return 'M';
        if(i2==2) return 'R';
        if(i2==3) return 'W';
    } else if(i1==1) {
        if(i2==2) return 'S';
        if(i2==3) return 'Y';
    } else if(i1==2) {
        if(i2==3) return 'K';
    }
    return 0;
};

inline bool isAnyBase(int b){
    return (b>=4);
};

inline bool isAnyBase(char b){
    return (baseToInt(b)>=4);
};

#endif