File: GenomeInBits.cpp

package info (click to toggle)
perm 0.4.0-8
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 976 kB
sloc: cpp: 13,499; makefile: 98; sh: 12
file content (191 lines) | stat: -rw-r--r-- 7,802 bytes
parent folder | download | duplicates (5)
#include "GenomeInBits.h"

CGenomeInBits::CGenomeInBits(unsigned int uiGenomeSize)
{
    this->initialization(uiGenomeSize);
}

CGenomeInBits::~CGenomeInBits(void)
{
    delete [] this->pLowerBits;
    delete [] this->pUpperBits;
    delete this->pNBits;
    // don't delete this->pGenome.
}

CGenomeInBits::CGenomeInBits(CGenomeNTdata* pGenome)
{
    this->initialization(pGenome->iGenomeSize);
    this->pGenome = pGenome;
    // If there are non-ACGT characters in a chromosome, the corresponding bits in memory will be set to A (00).

    for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
        CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
        // (1) Encode the junction of each chromosome
        encodeJunction(chrId);

        // (2) Encode the middle part of the chromosome
        unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
        unsigned int blockId = (uiGenomeLoucs / wordSize) + 1; // start at the second encoding block for the chromosome
        uiGenomeLoucs = blockId * wordSize ; // The start genome Locus Id of the block
        unsigned int chrLocusId = this->pGenome->genomeLocusID2chrIndex(uiGenomeLoucs);
        unsigned int i;
        // i is the block local Id in the chromosome. blockId is the global bloack Id
        for (i = 1; chrLocusId + i * wordSize < chr->iChromosome_size; blockId++, i++) {
            encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), wordSize,
                           &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
        }
        // (3) Encode the tail of the chromosome (won't be empty)
        encodeReadNasA(&(chr->caChromosome[chrLocusId + (i - 1) * wordSize]), chr->iChromosome_size - (chrLocusId + (i - 1) * wordSize),
                       &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
    }
    // (4) Encode bits N in a boolFlagArray
    unsigned int noOfNInGenome = 0;
    for (unsigned int chrId = 0; chrId < this->pGenome->iNo_of_chromosome; chrId++) {
        CchromosomeNTdata* chr = this->pGenome->paChromosomes[chrId];
        unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
        for (unsigned int i = 0; i < chr->iChromosome_size; i++, uiGenomeLoucs++) {
            if (!isACGT(chr->caChromosome[i])) {
                this->pNBits->setflag(uiGenomeLoucs, true);
                noOfNInGenome ++;
            }
        }
    }
    if ( noOfNInGenome > 0) {
        LOG_INFO("Info %d: There are %u N in the genome.\r", INFO_LOG, noOfNInGenome);
    }
}

int CGenomeInBits::initialization(unsigned int uiGenomeSize)
{
    this->pNBits = NULL;
    this->pLowerBits = NULL;
    this->pUpperBits = NULL;
    this->pGenome = NULL;
    caSubstring[0] = '\0';
    return (allocBitStrSpace(uiGenomeSize));
}

int CGenomeInBits::allocBitStrSpace(unsigned int uiGenomeSize)
{
    if (uiGenomeSize > 0) {
        if (this->pLowerBits == NULL && this->pUpperBits == NULL && this->pNBits == NULL) {
            this->uiGenomeLength = uiGenomeSize;
            this->uiGenomeLengthInWordSize = (uiGenomeLength - 1) / wordSize + 2; // add one more
            this->pLowerBits = new WORD_SIZE[uiGenomeLengthInWordSize];
            this->pUpperBits = new WORD_SIZE[uiGenomeLengthInWordSize];
            this->pNBits = new CboolFlagArray(uiGenomeLength);
            memset(pLowerBits, 0x00, uiGenomeLengthInWordSize);
            memset(pUpperBits, 0x00, uiGenomeLengthInWordSize);
        } else {
            ERR;
        }
    }
    return(0);
}

int CGenomeInBits::encodeJunction(unsigned int chrId)
{
    char region[wordSize + 1];
    unsigned int uiGenomeLoucs = this->pGenome->chrIndex2genomelocusID(chrId, 0);
    unsigned int blockId = uiGenomeLoucs / wordSize;

    if (chrId == 0) {
        strncpy(region, this->pGenome->paChromosomes[chrId]->caChromosome, wordSize);
    } else {
        unsigned int tailLength = uiGenomeLoucs % wordSize; // tail of the previous chromosome in the junction block
        CchromosomeNTdata* pChr;
        pChr = this->pGenome->paChromosomes[chrId - 1];
        strncpy(region, &(pChr->caChromosome[pChr->iChromosome_size - tailLength]), tailLength);
        pChr = this->pGenome->paChromosomes[chrId];
        strncpy(&region[tailLength], pChr->caChromosome, wordSize - tailLength);
    }
    region[wordSize] = '\0';
    encodeReadNasA(region, wordSize, &this->pUpperBits[blockId], &this->pLowerBits[blockId]);
    return(0);
}

/* Note the chromosome encoding is NOT continuous. The more significant bits of each words encodes
 * nucleotides in front of those less significant bits. This encodeing make bits incontinuously mapped
 * to chromosome index. Note the first bit of each WORD represent the first base of each section
 */
CReadInBits CGenomeInBits::getSubstringInBits(unsigned int uiGenomeIndex) const
{
    CReadInBits r;
    unsigned int indexInWords = uiGenomeIndex / wordSize;
    unsigned int bitsShift = uiGenomeIndex % wordSize;
    if (this->uiGenomeLengthInWordSize > indexInWords) {
        r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
        r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
        if (bitsShift != 0) {
            r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
            r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
        }
    } else
        LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
    return(r);
}

// eliminate the tail bits out of read length range
CReadInBits CGenomeInBits::getSubstringInBits\
(unsigned int uiGenomeIndex, unsigned int uiSubstringLength) const
{
    CReadInBits r;
    if(uiGenomeIndex < this->uiGenomeLength) {
        unsigned int indexInWords = uiGenomeIndex / wordSize;
        unsigned int bitsShift = uiGenomeIndex % wordSize;
        r.UpperBits = this->pUpperBits[indexInWords] >> bitsShift;
        r.LowerBits = this->pLowerBits[indexInWords] >> bitsShift;
        if (bitsShift != 0) {
            r.UpperBits |= (this->pUpperBits[indexInWords + 1] << (wordSize - bitsShift));
            r.LowerBits |= (this->pLowerBits[indexInWords + 1] << (wordSize - bitsShift));
        }
        unsigned int elimatedBitsNo = wordSize - uiSubstringLength;
        r.UpperBits <<= elimatedBitsNo;
        r.LowerBits <<= elimatedBitsNo;
        r.UpperBits >>= elimatedBitsNo;
        r.LowerBits >>= elimatedBitsNo;
    } else {
        cout << "Access outside of the genomeInBits" << endl;
    }
    return (r);
}

char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex)
{
    CReadInBits r = getSubstringInBits(uiGenomeIndex);
    decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
    return (caSubstring);
}

char* CGenomeInBits::getSubstring(unsigned int uiGenomeIndex, unsigned int uiSubstringLength)
{
    CReadInBits r = getSubstringInBits(uiGenomeIndex);
    decodeRead(caSubstring, wordSize, r.UpperBits, r.LowerBits);
    if (uiSubstringLength <= wordSize) {
        caSubstring[uiSubstringLength] = '\0';
    }

    if (uiGenomeIndex +  uiSubstringLength > uiGenomeLength) {
        if (uiGenomeLength > uiGenomeIndex)
            caSubstring[uiGenomeLength - uiGenomeIndex] = '\0';
        else {
            LOG_INFO("\nInfo %d: wrong genome index.\n", WARNING_LOG);
            caSubstring[0] = '\0';
        }
    }
    return (caSubstring);
}

bool CGenomeInBits::fragACGTKmerInBits(CReadInBits& kmerInBits, int startIndex, int kmerLength)
{
    for (int i = 0; i < kmerLength; i++) {
        int baseGenomeIndex = i + startIndex;
        if (this->pNBits->b(baseGenomeIndex)) { // Meet N
            return(false);
        }
    }
    kmerInBits = getSubstringInBits(startIndex, kmerLength);
    return(true);
}