File: ColorSpaceRead.h

package info (click to toggle)
perm 0.4.0-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 976 kB
  • sloc: cpp: 13,499; makefile: 98; sh: 12
file content (295 lines) | stat: -rw-r--r-- 11,058 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#pragma once
#ifndef COLOR_SPACE_READ_H
#define COLOR_SPACE_READ_H

#include "ReadInBits.h"
#include "ShortReadUtil.h"
#include "stdafx.h"

/*
 * This file provides functions that translate reads between base space and
 * color space (in both directions), operating on reads encoded in class CReadInBits.
 */

// Convert a color-encoded read back to bases (defined elsewhere).
// Per the original note, the first base is encoded in the first bit of the
// returned CReadInBits.
CReadInBits colors2Bases(CReadInBits readInColors);

// Translate a color read given as a C string in B0123 format into readInColors.
// NOTE(review): the bool result presumably reports whether encoding succeeded --
// confirm against the definitions.
bool encodeColors(const char* colorsStr, CReadInBits&  readInColors);
// NOTE(review): variant of encodeColors; the name suggests 'N' is encoded as
// color 3 -- confirm against the definition.
bool encodeColorsNas3(const char* colorsStr, CReadInBits&  readInColors);
// Translate a color read held in CReadInBits into a C string in B0123 format.
// The output is written into the caller-supplied colorsStr buffer.
// NOTE(review): the returned pointer is presumably colorsStr itself -- confirm.
char* decodeColors(char* colorsStr, CReadInBits readInColors);
// NOTE(review): presumably the same as decodeColors for reads without the
// leading base (colors only) -- confirm against the definition.
char* decodePureColors(char* colorsStr, CReadInBits readInColors);
// Decode a long color read that is stored as two CReadInBits halves.
// NOTE(review): oddRead presumably flags an odd total read length -- confirm.
char* decodeLongColors(char* colorsStr, CReadInBits readInColors1stHalf, CReadInBits readInColors2ndHalf, bool oddRead);

// Decode a color read into caRead with a primer base in front of it.
// decodeColors() writes its output starting at caRead[1]; the character it
// placed there is then copied to caRead[0] and caRead[1] is overwritten with
// the color '0'. caRead must therefore have room for one character more than
// decodeColors() writes. Returns caRead.
inline char* decodeColorReadWithPrimer(char* caRead, CReadInBits readInColors)
{
    char* const decoded = caRead + 1;
    decodeColors(decoded, readInColors);
    // Mimic the first base from the primer
    caRead[0] = decoded[0];
    decoded[0] = '0';
    return caRead;
}

// Defined elsewhere. Takes a read and a reference, both in color space, plus
// caller-supplied buffers for the decoded read and its quality scores.
// NOTE(review): presumably corrects the read against the reference when
// `correct` is true, writes the decoded read to caRead and adjusts caQscore,
// and returns caRead -- confirm against the definition.
char* correctAndDecodeRead \
(CReadInBits readInColors, CReadInBits refInColors, bool correct, char* caRead, char* caQscore);

// Color (as a character '0'..'3') produced by two adjacent bases.
// Indexed as [id of the previous base][id of the current base]; the entry at
// [r][c] is the digit for (r XOR c), so the table is symmetric and has '0'
// on its diagonal.
static const char color_transfer_matrix[4][4] = {
    /* r = 0 */ { '0', '1', '2', '3' },
    /* r = 1 */ { '1', '0', '3', '2' },
    /* r = 2 */ { '2', '3', '0', '1' },
    /* r = 3 */ { '3', '2', '1', '0' }
};

// Translate a NUL-terminated read in base space into a color-space string.
//
// Output layout: readInColor[0] is the first base, readInColor[1] is the
// color '0', and readInColor[i + 1] is the color of the transition from
// readInBase[i - 1] to readInBase[i], looked up in color_transfer_matrix via
// nt2Id(). The output is NUL-terminated when the input terminator is reached.
//
// An empty input yields an empty output string. The previous code went on to
// read readInBase[1], i.e. past the terminator, in that case.
//
// At most MAX_READ_LENGTH * 2 input characters are examined. If no terminator
// is found within that range the output is NOT NUL-terminated.
// TODO Be careful about input without '\0' and about over-long input.
// NOTE(review): nt2Id() is assumed to return 0..3 for every character in the
// read; any other value would index outside color_transfer_matrix -- confirm.
inline void basesStr2ColorStr(const char* readInBase, char* readInColor)
{
    readInColor[0] = readInBase[0];
    if (readInBase[0] == '\0') {
        // Nothing to translate; the output is already an empty string.
        return;
    }
    readInColor[1] = '0';
    for (unsigned int i = 1; i < MAX_READ_LENGTH * 2; i++)  {
        const char prevBase = readInBase[i - 1];
        const char curBase = readInBase[i];
        if (curBase == '\0') {
            readInColor[i + 1] = '\0';
            return;
        }
        readInColor[i + 1] = color_transfer_matrix[nt2Id(prevBase)][nt2Id(curBase)];
    }
}


// Convert a base-encoded read to color space, keeping the first base.
// In each bit plane, result bit k is base bit k XOR base bit (k - 1). The left
// shift feeds a zero into bit 0, so bit 0 of the result is the first base
// itself and the following bits are color transitions.
// Per the original note, the most significant bit of the result is not meaningful.
inline CReadInBits bases2Colors(CReadInBits readInBase)
{
    CReadInBits colorsWithLeadBase;
    colorsWithLeadBase.LowerBits = (readInBase.LowerBits << 1) ^ readInBase.LowerBits;
    colorsWithLeadBase.UpperBits = (readInBase.UpperBits << 1) ^ readInBase.UpperBits;
    return colorsWithLeadBase;
}

// Convert a base-encoded read to colors only (the first base is not kept).
// In each bit plane, result bit k is base bit k XOR base bit (k + 1), so bit 0
// is the transition between the first and the second base.
// Per the original note, the most significant bit of the result is not meaningful.
inline CReadInBits bases2PureColors(CReadInBits readInBase)
{
    CReadInBits transitionsOnly;
    transitionsOnly.LowerBits = (readInBase.LowerBits >> 1) ^ readInBase.LowerBits;
    transitionsOnly.UpperBits = (readInBase.UpperBits >> 1) ^ readInBase.UpperBits;
    return transitionsOnly;
}

// Reverse the order of the color signals of a pure-color read (colors only,
// no leading base); this is the color sequence of the reverse complement.
//
// Each bit plane is bit-reversed over the whole word with reverse64bits() and
// then shifted right by (wordSize - colorsLength), so the colorsLength
// meaningful bits end up in the low positions again.
//
// colorsLength == 0 returns all-zero bits. Without this guard the shift count
// would equal wordSize, which is undefined behavior when wordSize is the bit
// width of the word (as reverse64bits suggests -- NOTE(review): confirm).
// Precondition: colorsLength <= wordSize.
inline CReadInBits reversePureColors(CReadInBits readInPureColors, unsigned int colorsLength)
{
    CReadInBits reverseColors;
    if (colorsLength == 0) {
        // No colors to reverse; avoid a full-width shift.
        reverseColors.UpperBits = 0;
        reverseColors.LowerBits = 0;
        return(reverseColors);
    }
    const unsigned int tailLength = (wordSize - colorsLength);
    reverseColors.UpperBits = (reverse64bits(readInPureColors.UpperBits) >> tailLength);
    reverseColors.LowerBits = (reverse64bits(readInPureColors.LowerBits) >> tailLength);
    return(reverseColors);
}

// Color-encode a long read that is stored as two CReadInBits halves.
//
// Each half is first converted on its own with bases2Colors(), which leaves
// that half's first base in bit 0. Bit 0 of the second half is then replaced
// by the transition between the last base of the first half and the first
// base of the second half, so the first base is encoded only once, in the
// first (least significant) bit of the first half.
//
// oddReadLength selects where the last base of the first half is read from:
// bit (iReadLength - 2) when true, bit (iReadLength - 1) otherwise.
inline void longBases2Colors(CReadInBits& readInBase1stHalf, CReadInBits& readInBase2ndHalf, CReadInBits& readInColor1stHalf, CReadInBits& readInColor2ndHalf, bool oddReadLength)
{
    readInColor1stHalf = bases2Colors(readInBase1stHalf);
    readInColor2ndHalf = bases2Colors(readInBase2ndHalf);

    // Bit index of the last base held in the first half.
    const unsigned int lastBasePos =
        (unsigned int)CReadInBits::iReadLength - (oddReadLength ? 0x02 : 0x01);

    // Last base of the first half and first base of the second half, per bit plane.
    const unsigned long long tailUpper = longlongShiftRight(readInBase1stHalf.UpperBits, lastBasePos) & 0x01;
    const unsigned long long tailLower = longlongShiftRight(readInBase1stHalf.LowerBits, lastBasePos) & 0x01;
    const unsigned long long headUpper = readInBase2ndHalf.UpperBits & 0x01;
    const unsigned long long headLower = readInBase2ndHalf.LowerBits & 0x01;

    // Color transition across the boundary between the two halves.
    const bool junctionUpper = ((tailUpper ^ headUpper) & 0x01) != 0;
    const bool junctionLower = ((tailLower ^ headLower) & 0x01) != 0;

    // Clear bit 0 of the second half and store the boundary transition there.
    readInColor2ndHalf.UpperBits >>= 0x01;
    readInColor2ndHalf.UpperBits <<= 0x01;
    if (junctionUpper) {
        readInColor2ndHalf.UpperBits |= 0x01;
    }
    readInColor2ndHalf.LowerBits >>= 0x01;
    readInColor2ndHalf.LowerBits <<= 0x01;
    if (junctionLower) {
        readInColor2ndHalf.LowerBits |= 0x01;
    }
}

// Merge a read and a reference in color space under a bit mask.
// For every bit position set in SNP_FLAG the result takes the bit from
// readInColors; for every other position it takes the bit from refInColors.
// Per the original note, SNP_FLAG marks the color mismatches caused by a SNP,
// so the remaining (single) mismatches are replaced by the reference colors.
inline CReadInBits correctReadInColorSpace(CReadInBits readInColors, CReadInBits refInColors, WORD_SIZE SNP_FLAG)
{
    const WORD_SIZE takeFromRef = ~SNP_FLAG;
    CReadInBits merged;
    merged.LowerBits = (refInColors.LowerBits & takeFromRef) | (readInColors.LowerBits & SNP_FLAG);
    merged.UpperBits = (refInColors.UpperBits & takeFromRef) | (readInColors.UpperBits & SNP_FLAG);
    return merged;
}

// Lookup table that translates a base and a color into the next base.
// Rows are the current base in the order A, C, G, T; columns are the colors
// '0'..'3'; each entry is the base that follows.
const static char base_transfer_matrix[][4] = {
    { 'A', 'C' , 'G', 'T' },
    { 'C', 'A' , 'T', 'G' },
    { 'G', 'T', 'A', 'C' },
    { 'T', 'G', 'C', 'A' }
};

// Return the base that follows `base` when the color between them is `color`.
//   base  : 'A', 'C', 'G' or 'T' in either case.
//   color : one of the characters '0', '1', '2', '3'.
// Returns 0 when base is not a recognized nucleotide or when color is outside
// '0'..'3'. The color used to be turned into an index without a range check,
// so a character such as '.' or '4' read outside the table.
inline char base2Color(char base, char color)
{
    if (color < '0' || color > '3') {
        return(0);
    }
    int row;
    switch (base) {
    case 'A':
    case 'a':
        row = 0;
        break;
    case 'C':
    case 'c':
        row = 1;
        break;
    case 'G':
    case 'g':
        row = 2;
        break;
    case 'T':
    case 't':
        row = 3;
        break;
    default:
        return(0);
    }
    return (base_transfer_matrix[row][color - '0']);
}

// Compare a read with a reference in color space (defined elsewhere).
// If there are two consecutive mismatched colors, check whether they form a valid SNP.
// Returns the number of mismatches, or a negative code for a valid SNP:
// -1 = Complement, -2 = Transversion, -3 = Transition.
int getSNPtype(CReadInBits readInColors, CReadInBits refInColors);
// Classify the difference between a read and a reference as a one-letter flag.
// Maps the getSNPtype() result -1 / -2 / -3 to 'C' / 'V' / 'T'; every other
// result (a plain mismatch count) maps to 'N'.
inline char returnSNPtype(CReadInBits readInColors, CReadInBits refInColors)
{
    const int snpCode = getSNPtype(readInColors, refInColors);
    if (snpCode == -1) {
        return 'C';
    }
    if (snpCode == -2) {
        return 'V';
    }
    if (snpCode == -3) {
        return 'T';
    }
    return 'N';
}

// Translate a one-letter SNP flag into a number:
// 'C' -> 1, 'V' -> 2, 'T' -> 3, anything else -> 0. The match is case-sensitive.
inline int returnSNPtype(char c)
{
    if (c == 'C') {
        return 1;
    }
    if (c == 'V') {
        return 2;
    }
    if (c == 'T') {
        return 3;
    }
    return 0;
}

// Correct the single mismatches in the color read against the reference and
// store the result in correctedRead (defined elsewhere).
// NOTE(review): the meaning of the int result is not visible here; it
// presumably follows the getSNPtype() convention -- confirm.
int correctReadInColorSpace(CReadInBits readInColors, CReadInBits refInColors, CReadInBits& correctedRead);
// Convert color-space quality values in Qscores to base-space quality values
// (defined elsewhere).
// NOTE(review): Qscores is presumably rewritten in place -- confirm.
void colorQV2baseQV(CReadInBits readInColors, CReadInBits& correctedRead, char* Qscores);
// Overload driven by a bit flag of single color errors and the read length.
// NOTE(review): the meaning of the bool result is not visible here -- confirm.
bool colorQV2baseQV(WORD_SIZE singleColorErrorflag, char* Qscores, unsigned int readLength);

// Store the base (or color digit) c in bit 0 of readInBase.
//
// Encoding of bit 0 as (UpperBits, LowerBits):
//   A / a / '0' -> (0, 0)     C / c / '1' -> (0, 1)
//   G / g / '2' -> (1, 0)     T / t / '3' -> (1, 1)
// All other bits of readInBase are left unchanged.
//
// Returns true when bit 0 was set from c. Returns false and leaves readInBase
// untouched for 'N' / 'n' (silently) and for any other character (after
// printing a message to cout).
//
// Changes from the previous version: lowercase 'n' is treated like 'N', in
// line with the other case-insensitive bases, and the character is converted
// to unsigned char before isprint(), which is undefined for negative values.
inline bool setFirstBase(char c, CReadInBits& readInBase)
{
    switch (c) {
    case 'a':
    case 'A':
    case '0':
        readInBase.UpperBits &= (~0x01);
        readInBase.LowerBits &= (~0x01);
        return(true);
    case 'c':
    case 'C':
    case '1':
        readInBase.UpperBits &= (~0x01);
        readInBase.LowerBits |= 0x01;
        return(true);
    case 'g':
    case 'G':
    case '2':
        readInBase.UpperBits |= 0x01;
        readInBase.LowerBits &= (~0x01);
        return(true);
    case 't':
    case 'T':
    case '3':
        readInBase.UpperBits |= 0x01;
        readInBase.LowerBits |= 0x01;
        return(true);
    case 'n':
    case 'N':
        // First base is N: keep whatever bit 0 already holds.
        return(false);
    default:
        if (isprint(static_cast<unsigned char>(c))) {
            cout << "Unknown character " << c << " in the first base of read." << endl;
        } else {
            cout << "Unknown character with ascii" << (int)c << " in the first base of read." << endl;
        }
        return(false);
    }
}

// Reverse a color read that carries its first base in bit 0.
//
// The color signals are reversed with reverseBitsSignals(), using a shift that
// leaves bit 0 free for the leading base. That base must be the last base of
// the original read, so the read is decoded with colors2Bases() and the bit at
// position (iReadLength - 1) of each bit plane is copied into bit 0 of the result.
inline CReadInBits reverseColorRead(CReadInBits readInColors)
{
    // Leave one more bit to put the end base bit
    const int shiftAfterReverse = wordSize - CReadInBits::iReadLength - 1;
    CReadInBits reversed = reverseBitsSignals(readInColors, shiftAfterReverse);

    // Bit 0 is not a valid base after the reversal; take it from the decoded read.
    CReadInBits decodedBases = colors2Bases(readInColors);
    bool lastBaseLower = isKthBitSet(decodedBases.LowerBits, CReadInBits::iReadLength - 1);
    bool lastBaseUpper = isKthBitSet(decodedBases.UpperBits, CReadInBits::iReadLength - 1);
    setKthBit(reversed.LowerBits, 0, lastBaseLower);
    setKthBit(reversed.UpperBits, 0, lastBaseUpper);
    return reversed;
}

// Intended to reverse a long color read held in two halves, but currently a
// no-op: the whole body is commented out, so the arguments are left untouched.
// NOTE(review): the disabled draft below cannot be enabled as-is. It refers to
// names that are not declared in it (readInColors, and shiftBitsAfterReverse
// in the odd-length branch) and never writes its results back to the arguments.
inline void reverseLongColorRead(CReadInBits& colorRead1stHalf, CReadInBits& colorRead2ndHalf, bool oddReadLength)
{
    // Fix the first base is not correct after the reverse
    /*
    CReadInBits readInBases = colors2Bases(readInColors);
    if(oddReadLength) {
        const int shiftBitsAfterReverse1 = wordSize - CReadInBits::iReadLength;
        CReadInBits revColorRead2 = reverseBitsSignals(colorRead1stHalf, shiftBitsAfterReverse);
        const int shiftBitsAfterReverse2 = wordSize - CReadInBits::iReadLength - 1;
        CReadInBits revColorRead1 = reverseBitsSignals(colorRead2ndHalf, shiftBitsAfterReverse);
    } else {
        const int shiftBitsAfterReverse = wordSize - CReadInBits::iReadLength - 1;
        CReadInBits revColorRead2 = reverseBitsSignals(colorRead1stHalf, shiftBitsAfterReverse);
        CReadInBits revColorRead1 = reverseBitsSignals(colorRead2ndHalf, shiftBitsAfterReverse);
    }
    */
}

// Redeclaration of decodeLongColors (also declared near the top of this
// header, where the last parameter is named oddRead).
char* decodeLongColors(char* colorsStr, CReadInBits readInColors1stHalf, CReadInBits readInColors2ndHalf, bool oddReadLength);

// Self-test entry points, defined elsewhere.
// NOTE(review): going by the names, these exercise the 64-bit shift helper,
// longBases2Colors and the color-signal reversal -- confirm against the definitions.
void testShift64Bit(void);
void testLongBases2ColorsCases(void);
void testLongBases2Colors(const char* longRead, const char* expLongColorSignals);
void testReverseColorSignals(const char* colorSignalStr);

// NOTE(review): presumably checks that the SNP type computed for
// (crInColors, refInColors) equals SNPType -- confirm against the definition.
void assertSNP(int SNPType, CReadInBits refInColors, CReadInBits crInColors);

// Given a read as a string of bases, return the corresponding color signals
// written with the letters A, C, G, T standing for the colors 0, 1, 2, 3.
string readInBases2ColorsInACGT_Format(string readInBases);
// Given a color read written in that ACGT letter format, return the
// corresponding string written with the digits 0, 1, 2, 3.
// Note the first base is duplicated.
string colorReadInACGTto0123Format(string colorReadInACGT);

#endif /* COLOR_SPACE_READ_H */