File: ReadsQualScores.cpp

package info (click to toggle)
perm 0.4.0-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 976 kB
  • sloc: cpp: 13,499; makefile: 98; sh: 12
file content (207 lines) | stat: -rw-r--r-- 7,641 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#include "ReadsQualScores.h"

CReadsQualScores::CReadsQualScores(void)
{
    initialization(0, 0);
}

CReadsQualScores::CReadsQualScores(unsigned int readLength, unsigned int numOfReads)
{
    this->initialization(readLength, numOfReads);
}

CReadsQualScores::~CReadsQualScores(void)
{
    delete [] this->QSarray;
    this->QSarray = NULL;
}

void CReadsQualScores::clear(void)
{
    this->load = 0;
}

void CReadsQualScores::reserve(unsigned int numOfReads)
{
    if (numOfReads > this->numOfReads) {
        this->numOfReads = numOfReads;
        if (this->QSarray != NULL) {
            delete [] this->QSarray; // all stored quality score will be gone
            this->QSarray = NULL;
        }
        this->size = readLength *  numOfReads;
        this->QSarray = new char[this->size];
        memset(this->QSarray, '\0', this->size);
    }
}

void CReadsQualScores::initialization(unsigned int readLength, unsigned int numOfReads)
{
    this->size = 0; // capacity in bases
    this->QSarray = NULL;
    this->scoreType = 'I';
    this->load = 0; // how many reads' qualities score have been load
    this->numOfReads = 0; // must initialize to 0 and reserve
    this->readLength = readLength;
    this->reserve(numOfReads);
}

/*
 * This function open an QUAL file and get the first pReadsID->size quality scores
 */
bool CReadsQualScores::openQUALfile(const char* Filename)
{
    if (fileExist(Filename)) {
        if (hasTheExtName(Filename, ".qual") ||
                hasTheExtName(Filename, ".QUAL")) {
            const unsigned int FILE_INPUT_BUFFER_SIZE = 1000000;
            this->scoreType = 'S';
            this->ifile.open(Filename);
            this->IBuf.initialize(FILE_INPUT_BUFFER_SIZE, &ifile);
            return(true);
        }
    }
    return(false);
}

unsigned int CReadsQualScores::getQualityScoresFromQUAL(vector<CReadID>* pReadsID)
{
    char caBuffer[MAX_LINE];
    char caReadTag[MAX_LINE];
    caReadTag[0] = '\0';

    while (this->load < (unsigned int)pReadsID->size()) {
        unsigned int qsIndex = this->load  * this->readLength;
        if (this->load >= numOfReads) {
            const char* theReadId = pReadsID->at(this->load).id;
            LOG_INFO("Info %d: Read %s has no quality.\n", WARNING_LOG, theReadId);
            break;
        }
        caBuffer[0] = '\0';
        if (this->IBuf.Getline(caBuffer, MAX_LINE) == 0) {
            break; // Note this->pBuf->Getline() will return 0 if EOF
        } else {
            //If this line is header, new line, comment or null line however not EOF, read the next line
            if (caBuffer[0] == '>' && ifile.eof() == false) {
                myStrCpy(caReadTag, &(caBuffer[1]), READ_ID_LENGTH - 2);
                formatReadId(caReadTag);
            } else if ((caBuffer[0] == ' ' || caBuffer[0] == '\n' ||
                        caBuffer[0] == '#' || caBuffer[0] == '\0') && ifile.eof() == false) {
                continue;
            } else { // No warning for the case that the number of quality score and the read length are not matched.
                // check the tag to see if it is the correct place
                int iReadId = alignReadId4QScores(caReadTag, this->load, pReadsID);
                if (iReadId < 0) {
                    continue; // bad read, skip its quality
                } else {
                    this->load = (unsigned int)iReadId;
                }
                int bufferIndex = 0;
                for (unsigned int pos = 0; pos < this->readLength; pos++) { //read the quality score for each base of the read
                    this->QSarray[qsIndex + pos] = (char)atoi(&caBuffer[bufferIndex]);
                    for (; caBuffer[bufferIndex] != ' '; bufferIndex++) {
                        if (caBuffer[bufferIndex] == '\0') break;
                    };
                    if (caBuffer[bufferIndex] == '\0') break;
                    bufferIndex++; // mov to the start of next qscore in the buffer
                }
                this->load ++; // get qs for one more reads
            }
        }
    }
    if (this->load < (unsigned int)pReadsID->size()) {
        LOG_INFO("Info %d: Not every read has quality score.\n", WARNING_LOG);
        for (unsigned int i = this->load; i < (unsigned int)pReadsID->size(); i++) {
            cout << "Ex:" << pReadsID->at(i).id << " doesn't get QS" << endl;
            break; // show only one
        }
        LOG_INFO("Info %d: Try arg --delimiter <Symbol> to get the correct read Id.\n", INFO_LOG);
    }
    return(this->load);
}

int CReadsQualScores::alignReadId4QScores(char* tag, unsigned int searchPoint, vector<CReadID>* pReadsID)
{
    if (pReadsID == NULL) {
        LOG_INFO("Info %d: No read Id is available.\n", WARNING_LOG);
    } else if (searchPoint >= (pReadsID->size())) {
        LOG_INFO("Info %d: Request quality scores over the read size %u.\n",\
                 WARNING_LOG, (unsigned int)pReadsID->size());
        return(searchPoint);
    } else  { // Currently, only check the previous and the next tag if the current one is not matched.
        char* expectedTag = pReadsID->at(searchPoint).id;
        if (strcmp(expectedTag, tag) == 0) {
            return(searchPoint); // read ID correctly aligned
        }
        // align to next tag
        if (pReadsID->size() > searchPoint + 1) {
            expectedTag = pReadsID->at(searchPoint + 1).id;
            if (strcmp(expectedTag, tag) == 0) {
                LOG_INFO("\nInfo %d: Read %s has no quality score.\n",\
                         WARNING_LOG, pReadsID->at(searchPoint).id);
                return(searchPoint + 1);
            }
        }
        // aligned to previous tag
        if (pReadsID->size() > 0 && searchPoint > 0) {
            expectedTag = pReadsID->at(searchPoint - 1).id;
            if (strcmp(expectedTag, tag) == 0) {
                LOG_INFO("\nInfo %d: Extra quality scores for read %d.\n",\
                         WARNING_LOG, searchPoint - 1);
                return(searchPoint - 1);
            }
        }
        // cout << "Bad read " << tag << endl;
        // STRIKE_KEY2CONTINUE;
    }
    return(-1); // bad read with N, should skip its quality.
}

void trQScores(unsigned int readLength, char qShift, const char* oldQSs, char* newQSs)
{
    unsigned int i;
    for (i = 0; i < readLength; i++) {
        newQSs[i] = oldQSs[i] + qShift;
    }
    newQSs[i] = '\0';
}

double getAverageQualityScores(CReadsQualScores& scores)
{
    double sumOfQulityScore = 0;
    for (unsigned int i = 0; i < scores.load; i++) {
        for (unsigned int j = 0; j < scores.readLength; j++) {
            sumOfQulityScore += (double)scores.qs(i, j);
        }
    }
    return(sumOfQulityScore / (double)(scores.readLength * scores.load));
}

int alignmentScore(char* str1, char* str2, unsigned int readLength, const char* sc)
{
    int score = 0;
    for (unsigned int i = 0; i < readLength; i++) {
        // sum of mismatched scores
        if (str1[i] != str2[i]) {
            int baseQuality = (int)sc[i];
            if ( baseQuality > 0) {
                score += baseQuality;
            }
        }
    }
    return(score);
}

void printCommaSepScoresStr(unsigned int readlength, const char* qScores, char* qScoresStr)
{
    char scoreStr[MAX_LINE];
    int scoreStrLength = 0;
    unsigned int i = 0, j = 0;
    for (i = 0, j = 0; i < readlength; i++, j+=scoreStrLength) {
        sprintf(scoreStr, "%d,", (int)qScores[i]);
        scoreStrLength = (int)strlen(scoreStr);
        sprintf(&(qScoresStr[j]), "%s", scoreStr);
    }
    if (j > 0) j--; // removed the last ','
    qScoresStr[j] = '\0';
}