File: AlignedSequenceLoader.cxx

package info (click to toggle)
arb 6.0.6-8
  • links: PTS, VCS
  • area: non-free
  • in suites: sid, trixie
  • size: 66,204 kB
  • sloc: ansic: 394,911; cpp: 250,290; makefile: 19,644; sh: 15,879; perl: 10,473; fortran: 6,019; ruby: 683; xml: 503; python: 53; awk: 32
file content (199 lines) | stat: -rw-r--r-- 5,307 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
 * AlignedSequenceLoader.cxx
 *
 *  Created on: Feb 15, 2010
 *      Author: Breno Faria
 *
 *  Institute of Microbiology (Technical University Munich)
 *  http://www.arb-home.de/ 
 */


#include "AlignedSequenceLoader.h"

#include <arbdb.h>
#include <arbdbt.h>
#include "dbconn.h"

/**
 * Loads the marked sequences aligned from Arb's DB.
 * This loader only considers letters given by the following regular
 * expression: [ACGTUacgtu]
 */
AlignedSequenceLoader::AlignedSequenceLoader() {

    GBDATA *gb_main = runningDatabase();

    GB_ERROR error = GB_push_transaction(gb_main);
    if (error) {
        cout << "RNACMA-Error: " << error << "\n";
        exit(EXIT_FAILURE);
    }

    seqs = new VecVecType(0);

    char *al_name = GBT_get_default_alignment(gb_main);
    MSA_len = GBT_get_alignment_len(gb_main, al_name);

    size_t occurrences[MSA_len];
    for (size_t i = 0; i < MSA_len; i++)
        occurrences[i] = 0;

    cout << "loading marked species: ";
    flush( cout);
    for (GBDATA *gb_species = GBT_first_marked_species(gb_main); gb_species; gb_species
             = GBT_next_marked_species(gb_species)) {

        GBDATA *gb_ali = GB_entry(gb_species, al_name);

        if (gb_ali) { // existing alignment for this species
            GBDATA *gb_data = GB_entry(gb_ali, "data");

            if (gb_data) {

                cout << GBT_read_name(gb_species) << " ";
                flush(cout);

                string sequence = GB_read_string(gb_data);

                string *seq_as_vec = new string[MSA_len];
                int     k          = 0;
                for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) {
                    switch (*i) {
                        case 'A':
                        case 'a':
                            seq_as_vec[k] = "A";
                            occurrences[k] += 1;
                            break;
                        case 'C':
                        case 'c':
                            seq_as_vec[k] = "C";
                            occurrences[k] += 1;
                            break;
                        case 'G':
                        case 'g':
                            seq_as_vec[k] = "G";
                            occurrences[k] += 1;
                            break;
                        case 'T':
                        case 't':
                        case 'U':
                        case 'u':
                            seq_as_vec[k] = "T";
                            occurrences[k] += 1;
                            break;
                        default:
                            seq_as_vec[k] = "-";
                            break;
                    }
                    k++;
                }

                assert((size_t)k == MSA_len);
                vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]);
                delete [] seq_as_vec;

                seqs->push_back(seq_vector);
            }
        }

    }

    GB_pop_transaction(gb_main);

    cout << "done. Total number of species: " << seqs->size() << endl;
    flush(cout);

    cleanSeqs(occurrences, MSA_len);
}

/**
 * Returns the aligned seqs.
 *
 * @return the aligned seqs.
 */
VecVecType* AlignedSequenceLoader::getSequences() {
    return seqs;
}

/**
 * Destructor.
 */
AlignedSequenceLoader::~AlignedSequenceLoader() {
    delete seqs;
}

/**
 * Getter for the MSA_len.
 *
 * @return the MSA length.
 */
size_t AlignedSequenceLoader::getMsaLen() {
    return MSA_len;
}

/**
 * Getter for the position map. The position map maps positions in the clean
 * sequence to the original positions in the alignment.
 *
 * @return the position map.
 */
vector<size_t> * AlignedSequenceLoader::getPositionMap() {
    return position_map;
}

/**
 * This method cleans-up the empty positions of the MSA.
 *
 *
 * @param occurrences: a list gathering the number of occurrences of bases at
 *                     each position of the MSA.
 * @param len: the length of occurrences.
 */
void AlignedSequenceLoader::cleanSeqs(size_t *occurrences, long len) {

    cout << "cleaning-up sequences of empty positions... " << endl;
    flush( cout);

    size_t num_of_bases = 0;
    for (int i = 0; i < len; i++) {
        if (occurrences[i] != 0) {
            num_of_bases++;
        }
    }

    cout << "number of non-empty positions in MSA: " << num_of_bases
         << ". Filtered out " << len - num_of_bases << " positions." << endl;

    VecVecType *clean_seqs = new VecVecType(0);

    cout << "computing position map..." << endl;
    position_map = new vector<size_t> (num_of_bases, 0);
    int j = 0;
    for (int i = 0; i < len; i++) {
        if (occurrences[i] != 0) {
            position_map->at(j) = i;
            j++;
        }
    }

    for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) {
        //for every sequence
        vector<string> sequence(num_of_bases, "");
        int jj = 0;
        for (int i = 0; i < len; ++i) {
            if (occurrences[i] != 0) {
                sequence.at(jj) = seq->at(i);
                jj++;
            }
        }
        assert(sequence.size() == num_of_bases);
        clean_seqs->push_back(sequence);
    }

    seqs = clean_seqs;

    cout << "clean-up done." << endl;

}