1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
|
#include "GenomeInBits.h"
CGenomeInBits::CGenomeInBits(unsigned int uiGenomeSize)
{
this->initialization(uiGenomeSize);
}
CGenomeInBits::~CGenomeInBits(void)
{
delete [] this->pLowerBits;
delete [] this->pUpperBits;
delete this->pNBits;
// don't delete this->pGenome.
}
CGenomeInBits::CGenomeInBits(CGenomeNTdata* pGenome)
{
this->initialization(pGenome->iGenomeSize);
this->pGenome = pGenome;
// If there are non-ACGT characters in a chromosome, the corresponding bits in memory will be set to A (00).
for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
// (1) Encode the junction of each chromosome
encodeJunction(chrId);
// (2) Encode the middle part of the chromosome
unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
unsigned int blockId = (uiGenomeLoucs / wordSize) + 1; // start at the second encoding block for the chromosome
uiGenomeLoucs = blockId * wordSize ; // The start genome Locus Id of the block
unsigned int chrLocusId = this->pGenome->genomeLocusID2chrIndex(uiGenomeLoucs);
unsigned int i;
// i is the block local Id in the chromosome. blockId is the global bloack Id
for (i = 1; chrLocusId + i * wordSize < chr->iChromosome_size; blockId++, i++) {
encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), wordSize,
&this->pUpperBits[blockId], &this->pLowerBits[blockId]);
}
// (3) Encode the tail of the chromosome (won't be empty)
encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), chr->iChromosome_size - (chrLocusId + (i - 1) * wordSize),
&this->pUpperBits[blockId], &this->pLowerBits[blockId]);
}
// (4) Encode bits N in a boolFlagArray
unsigned int noOfNInGenome = 0;
for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
for (unsigned int i = 0; i < chr->iChromosome_size; i++, uiGenomeLoucs++) {
if (!isACGT(chr->caChromosome[i])) {
this->pNBits->setflag(uiGenomeLoucs, true);
noOfNInGenome ++;
}
}
}
if ( noOfNInGenome > 0) {
LOG_INFO("Info %d: There are %u N in the genome.\r", INFO_LOG, noOfNInGenome);
}
}
int CGenomeInBits::initialization(unsigned int uiGenomeSize)
{
this->pNBits = NULL;
this->pLowerBits = NULL;
this->pUpperBits = NULL;
this->pGenome = NULL;
caSubstring[0] = '\0';
return (allocBitStrSpace(uiGenomeSize));
}
int CGenomeInBits::allocBitStrSpace(unsigned int uiGenomeSize)
{
if (uiGenomeSize > 0) {
if (this->pLowerBits == NULL && this->pUpperBits == NULL && this->pNBits == NULL) {
this->uiGenomeLength = uiGenomeSize;
this->uiGenomeLengthInWordSize = (uiGenomeLength - 1) / wordSize + 2; // add one more
this->pLowerBits = new WORD_SIZE[uiGenomeLengthInWordSize];
this->pUpperBits = new WORD_SIZE[uiGenomeLengthInWordSize];
this->pNBits = new CboolFlagArray(uiGenomeLength);
memset(pLowerBits, 0x00, uiGenomeLengthInWordSize);
memset(pUpperBits, 0x00, uiGenomeLengthInWordSize);
} else {
ERR;
}
}
return(0);
}
int CGenomeInBits::encodeJunction(unsigned int chrId)
{
char region[wordSize + 1];
unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
unsigned int blockId = uiGenomeLoucs / wordSize;
if (chrId == 0) {
strncpy(region, this->pGenome->paChromosomes[chrId]->caChromosome, wordSize);
} else {
unsigned int tailLength = uiGenomeLoucs % wordSize; // tail of the previous chromosome in the junction block
CchromosomeNTdata* pChr;
pChr = this->pGenome->paChromosomes[chrId - 1];
strncpy(region, &(pChr->caChromosome[pChr->iChromosome_size - tailLength]), tailLength);
pChr = this->pGenome->paChromosomes[chrId];
strncpy(®ion[tailLength], pChr->caChromosome, wordSize - tailLength);
}
region[wordSize] = '\0';
encodeReadNasA(region, wordSize, &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
return(0);
}
/* Note the chromosome encoding is NOT continuous. The more significant bits of each words encodes
* nucleotides in front of those less significant bits. This encodeing make bits incontinuously mapped
* to chromosome index. Note the first bit of each WORD represent the first base of each section
*/
CReadInBits CGenomeInBits::getSubstringInBits(unsigned int uiGenomeIndex) const
{
CReadInBits r;
unsigned int indexInWords = uiGenomeIndex / wordSize;
unsigned int bitsShift = uiGenomeIndex % wordSize;
if (this->uiGenomeLengthInWordSize > indexInWords) {
r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
if (bitsShift != 0) {
r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
}
} else
LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
return(r);
}
// eliminate the tail bits out of read length range
CReadInBits CGenomeInBits::getSubstringInBits\
(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) const
{
CReadInBits r;
if(uiGenomeIndex < this->uiGenomeLength) {
unsigned int indexInWords = uiGenomeIndex / wordSize;
unsigned int bitsShift = uiGenomeIndex % wordSize;
r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
if (bitsShift != 0) {
r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
}
unsigned int elimatedBitsNo = wordSize - uiSubstringLength;
r.UpperBits <<= elimatedBitsNo;
r.LowerBits <<= elimatedBitsNo;
r.UpperBits >>= elimatedBitsNo;
r.LowerBits >>= elimatedBitsNo;
} else {
cout << "Access outside of the genomeInBits" << endl;
}
return (r);
}
char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex)
{
CReadInBits r = getSubstringInBits(uiGenomeIndex);
decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
return (caSubstring);
}
char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength)
{
CReadInBits r = getSubstringInBits(uiGenomeIndex);
decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
if (uiSubstringLength <= wordSize) {
caSubstring[uiSubstringLength] = '\0';
}
if (uiGenomeIndex + uiSubstringLength > uiGenomeLength) {
if (uiGenomeLength > uiGenomeIndex)
caSubstring[uiGenomeLength - uiGenomeIndex] = '\0';
else {
LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
caSubstring[0] = '\0';
}
}
return (caSubstring);
}
bool CGenomeInBits::fragACGTKmerInBits(CReadInBits& kmerInBits, int startIndex, int kmerLength)
{
for (int i = 0; i < kmerLength; i++) {
int baseGenomeIndex = i + startIndex;
if (this->pNBits->b(baseGenomeIndex)) { // Meet N
return(false);
}
}
kmerInBits = getSubstringInBits(startIndex, kmerLength);
return(true);
}
|