File: GenomeInBits.cpp

package info (click to toggle)
perm 0.4.0-8
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 976 kB
  • sloc: cpp: 13,499; makefile: 98; sh: 12
file content (191 lines) | stat: -rw-r--r-- 7,802 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#include "GenomeInBits.h"

CGenomeInBits::CGenomeInBits(unsigned int uiGenomeSize)
{
    this->initialization(uiGenomeSize);
}

CGenomeInBits::~CGenomeInBits(void)
{
    delete [] this->pLowerBits;
    delete [] this->pUpperBits;
    delete this->pNBits;
    // don't delete this->pGenome.
}

CGenomeInBits::CGenomeInBits(CGenomeNTdata* pGenome)
{
    this->initialization(pGenome->iGenomeSize);
    this->pGenome = pGenome;
    // If there are non-ACGT characters in a chromosome, the corresponding bits in memory will be set to A (00).

    for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
        CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
        // (1) Encode the junction of each chromosome
        encodeJunction(chrId);

        // (2) Encode the middle part of the chromosome
        unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
        unsigned int blockId = (uiGenomeLoucs / wordSize) + 1; // start at the second encoding block for the chromosome
        uiGenomeLoucs = blockId * wordSize ; // The start genome Locus Id of the block
        unsigned int chrLocusId = this->pGenome->genomeLocusID2chrIndex(uiGenomeLoucs);
        unsigned int i;
        // i is the block local Id in the chromosome. blockId is the global bloack Id
        for (i = 1; chrLocusId + i * wordSize < chr->iChromosome_size; blockId++, i++) {
            encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), wordSize,
                           &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
        }
        // (3) Encode the tail of the chromosome (won't be empty)
        encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), chr->iChromosome_size - (chrLocusId + (i - 1) * wordSize),
                       &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
    }
    // (4) Encode bits N in a boolFlagArray
    unsigned int noOfNInGenome = 0;
    for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
        CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
        unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
        for (unsigned int i = 0; i < chr->iChromosome_size; i++, uiGenomeLoucs++) {
            if (!isACGT(chr->caChromosome[i])) {
                this->pNBits->setflag(uiGenomeLoucs, true);
                noOfNInGenome ++;
            }
        }
    }
    if ( noOfNInGenome > 0) {
        LOG_INFO("Info %d: There are %u N in the genome.\r", INFO_LOG, noOfNInGenome);
    }
}

int CGenomeInBits::initialization(unsigned int uiGenomeSize)
{
    this->pNBits = NULL;
    this->pLowerBits = NULL;
    this->pUpperBits = NULL;
    this->pGenome = NULL;
    caSubstring[0] = '\0';
    return (allocBitStrSpace(uiGenomeSize));
}

int CGenomeInBits::allocBitStrSpace(unsigned int uiGenomeSize)
{
    if (uiGenomeSize > 0) {
        if (this->pLowerBits == NULL && this->pUpperBits == NULL && this->pNBits == NULL) {
            this->uiGenomeLength = uiGenomeSize;
            this->uiGenomeLengthInWordSize = (uiGenomeLength - 1) / wordSize + 2; // add one more
            this->pLowerBits = new WORD_SIZE[uiGenomeLengthInWordSize];
            this->pUpperBits = new WORD_SIZE[uiGenomeLengthInWordSize];
            this->pNBits = new CboolFlagArray(uiGenomeLength);
            memset(pLowerBits, 0x00, uiGenomeLengthInWordSize);
            memset(pUpperBits, 0x00, uiGenomeLengthInWordSize);
        } else {
            ERR;
        }
    }
    return(0);
}

int CGenomeInBits::encodeJunction(unsigned int chrId)
{
    char region[wordSize + 1];
    unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
    unsigned int blockId = uiGenomeLoucs / wordSize;

    if (chrId == 0) {
        strncpy(region, this->pGenome->paChromosomes[chrId]->caChromosome, wordSize);
    } else {
        unsigned int tailLength = uiGenomeLoucs % wordSize; // tail of the previous chromosome in the junction block
        CchromosomeNTdata* pChr;
        pChr = this->pGenome->paChromosomes[chrId - 1];
        strncpy(region, &(pChr->caChromosome[pChr->iChromosome_size - tailLength]), tailLength);
        pChr = this->pGenome->paChromosomes[chrId];
        strncpy(&region[tailLength], pChr->caChromosome, wordSize - tailLength);
    }
    region[wordSize] = '\0';
    encodeReadNasA(region, wordSize, &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
    return(0);
}

/* Note the chromosome encoding is NOT continuous. The more significant bits of each words encodes
 * nucleotides in front of those less significant bits. This encodeing make bits incontinuously mapped
 * to chromosome index. Note the first bit of each WORD represent the first base of each section
 */
CReadInBits CGenomeInBits::getSubstringInBits(unsigned int uiGenomeIndex) const
{
    CReadInBits r;
    unsigned int indexInWords = uiGenomeIndex / wordSize;
    unsigned int bitsShift = uiGenomeIndex % wordSize;
    if (this->uiGenomeLengthInWordSize > indexInWords) {
        r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
        r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
        if (bitsShift != 0) {
            r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
            r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
        }
    } else
        LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
    return(r);
}

// eliminate the tail bits out of read length range
CReadInBits CGenomeInBits::getSubstringInBits\
(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) const
{
    CReadInBits r;
    if(uiGenomeIndex < this->uiGenomeLength) {
        unsigned int indexInWords = uiGenomeIndex / wordSize;
        unsigned int bitsShift = uiGenomeIndex % wordSize;
        r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
        r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
        if (bitsShift != 0) {
            r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
            r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
        }
        unsigned int elimatedBitsNo = wordSize - uiSubstringLength;
        r.UpperBits <<= elimatedBitsNo;
        r.LowerBits <<= elimatedBitsNo;
        r.UpperBits >>= elimatedBitsNo;
        r.LowerBits >>= elimatedBitsNo;
    } else {
        cout << "Access outside of the genomeInBits" << endl;
    }
    return (r);
}

char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex)
{
    CReadInBits r = getSubstringInBits(uiGenomeIndex);
    decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
    return (caSubstring);
}

char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength)
{
    CReadInBits r = getSubstringInBits(uiGenomeIndex);
    decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
    if (uiSubstringLength <= wordSize) {
        caSubstring[uiSubstringLength] = '\0';
    }

    if (uiGenomeIndex +  uiSubstringLength > uiGenomeLength) {
        if (uiGenomeLength > uiGenomeIndex)
            caSubstring[uiGenomeLength - uiGenomeIndex] = '\0';
        else {
            LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
            caSubstring[0] = '\0';
        }
    }
    return (caSubstring);
}

bool CGenomeInBits::fragACGTKmerInBits(CReadInBits& kmerInBits, int startIndex, int kmerLength)
{
    for (int i = 0; i < kmerLength; i++) {
        int baseGenomeIndex = i + startIndex;
        if (this->pNBits->b(baseGenomeIndex)) { // Meet N
            return(false);
        }
    }
    kmerInBits = getSubstringInBits(startIndex, kmerLength);
    return(true);
}