File: ReadInBits.cpp

package info (click to toggle)
perm 0.4.0-8
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 976 kB
sloc: cpp: 13,499; makefile: 98; sh: 12
file content (345 lines) | stat: -rw-r--r-- 10,096 bytes
parent folder | download | duplicates (5)
#include "stdafx.h"
#include "ReadInBits.h"

int CReadInBits::iReadLength = 0; //read length must <= WORDSIZE

CReadInBits::CReadInBits(void)
{
}

CReadInBits::~CReadInBits(void)
{
}

CReadInBits::CReadInBits(const char* caRead)
{
    encodeRead(caRead, CReadInBits::iReadLength, &this->UpperBits, &this->LowerBits);
}

CReadInBits::CReadInBits(const char* caRead, int readlength)
{
    encodeRead(caRead, readlength, &this->UpperBits, &this->LowerBits);
}

unsigned int CReadInBits::encode(const char* caRead)
{
    return(encodeRead(caRead, CReadInBits::iReadLength, &this->UpperBits, &this->LowerBits));
}


unsigned int CReadInBits::encode(const char* caRead, int readlength)
{
    return(encodeRead(caRead, readlength, &this->UpperBits, &this->LowerBits));
}

// Encode 'N' as 'A'
unsigned int CReadInBits::encodeRead_NasA(const char* caRead, int readlength)
{
    return(encodeReadNasA(caRead, readlength, &this->UpperBits, &this->LowerBits));
}

char* CReadInBits::decode(char* caRead)
{
    decodeRead(caRead, CReadInBits::iReadLength, this->UpperBits, this->LowerBits);
    return(caRead);
}

int* CReadInBits::decode(int* iaRead) const
{
    decodeRead(iaRead, CReadInBits::iReadLength, this->UpperBits, this->LowerBits);
    return(iaRead);
}

bool CReadInBits::operator==(const CReadInBits &other) const
{
    /*
    int shift = (int)wordSize - iReadLength;
    return((this->UpperBits << shift) == (other.UpperBits << shift) && (this->LowerBits << shift) == (other.LowerBits << shift));
    */
    return(((this->UpperBits ^ other.UpperBits) | (this->LowerBits ^ other.LowerBits)) << (wordSize - iReadLength) == 0);
}

bool CReadInBits::operator<(const CReadInBits &other) const
{

    unsigned int spareTail = wordSize - iReadLength;
    WORD_SIZE u1 = this->UpperBits << spareTail;
    WORD_SIZE u2 = other.UpperBits << spareTail;

    if (u1 < u2) {
        return(true);
    } else if (u1 == u2) {
        return((this->LowerBits << spareTail) < (other.LowerBits << spareTail));
    } else {
        return(false);
    }
}

// uiReadLength must < WORD_SIZE which is 32 bp in 32big machine and 64 bp in 64 bit machine
// Each base is encoded into 2 bits: A -> 00, C->01, G->10 and T->11.
// These two digits are located in two word, for bits operation.
// The first nucleotide is encoded as the last digit.

unsigned int encodeRead(const char* caRead, int uiReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits)
{
    WORD_SIZE UpperBits = 0;
    WORD_SIZE LowerBits = 0;

    int i = uiReadLength - 1;
    do {
        switch (caRead[i]) {
        case 'A':
        case 'a':
            break;
        case 'C':
        case 'c':
            LowerBits ++;
            break;
        case 'G':
        case 'g':
            UpperBits ++;
            break;
        case 'T':
        case 't':
            UpperBits ++;
            LowerBits ++;
            break;
        case 'N':
        case 'n':
        case '.':
            return (1); //invalid read
        default:
            cout << "Unexpected character: " << caRead[i] << BLANK_LINE<< endl;
            return (1); //invalid read
        }
        i--;
        if (i >= 0) {
            UpperBits <<= 1; //shift 1
            LowerBits <<= 1;
        }
    } while ( i >= 0);
    *encodUpperBits = UpperBits;
    *encodedLowerBits = LowerBits;
    return (0);
}

// Encode 'N' as 'A'
unsigned int encodeReadNasA(const char* caRead, int uiReadLength, WORD_SIZE* encodUpperBits, WORD_SIZE* encodedLowerBits)
{
    WORD_SIZE UpperBits = 0;
    WORD_SIZE LowerBits = 0;

    int i = uiReadLength - 1;
    do {
        switch (caRead[i]) {
        case 'A':
        case 'a':
        case 'N':
        case '.':
            break;
        case 'C':
        case 'c':
            LowerBits ++;
            break;
        case 'G':
        case 'g':
            UpperBits ++;
            break;
        case 'T':
        case 't':
            UpperBits ++;
            LowerBits ++;
            break;
        default:
            cout << "Unexpected character: " << caRead[i] << " in " << caRead << endl;
            return (1); //invalid read
        }
        i--;
        if (i >= 0) {
            UpperBits <<= 1; //shift 1
            LowerBits <<= 1;
        }
    } while ( i >= 0);
    *encodUpperBits = UpperBits;
    *encodedLowerBits = LowerBits;
    return (0);
}

unsigned int decodeRead(int* iaRead, int iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits)
{
    int  i;
    for (i = 0; i < iReadLength; i++) {
        WORD_SIZE c = (UpperBits & 0x01) << 1 | (LowerBits & 0x01);
        iaRead[i] = (int)c;
        LowerBits >>= 1;
        UpperBits >>= 1;
    }
    iaRead[i] = -1;
    return 0;
}

unsigned int decodeRead(char* caRead, int iReadLength, WORD_SIZE UpperBits, WORD_SIZE LowerBits)
{
    int  i;
    for (i = 0; i < iReadLength; i++) {
        WORD_SIZE c = (UpperBits & 0x01) << 1 | (LowerBits & 0x01);
        switch (c) {
        case 0x00:
            caRead[i] = 'A';
            break;
        case 0x01:
            caRead[i] = 'C';
            break;
        case 0x02:
            caRead[i] = 'G';
            break;
        case 0x03:
            caRead[i] = 'T';
            break;
        default:
            caRead[i] = 'N';
        }
        LowerBits >>= 1;
        UpperBits >>= 1;
    }
    caRead[i] = '\0';
    return 0;
}

void reverseCompliment(unsigned int uiReadLength, WORD_SIZE* pUpperBits, WORD_SIZE* pLowerBits)
{
    WORD_SIZE LowerBits = ~(*pLowerBits);
    WORD_SIZE UpperBits = ~(*pUpperBits);

#ifdef __32BITS__
    UpperBits = reverse32bits(UpperBits);
    LowerBits = reverse32bits(LowerBits);
    unsigned int shifts = (wordSize - uiReadLength);
#else
    UpperBits = reverse64bits(UpperBits);
    LowerBits = reverse64bits(LowerBits);
    unsigned int shifts = (wordSize - uiReadLength);
#endif
    (*pUpperBits) = UpperBits >> shifts;
    (*pLowerBits) = LowerBits >> shifts;
}

CReadInBits reverseCompliment(unsigned int uiReadLength, CReadInBits r)
{
    reverseCompliment(uiReadLength, &(r.UpperBits), &(r.LowerBits));
    return(r);
}

// return number of bits set
inline unsigned int bitsSetCount(WORD_SIZE bits)
{
    // magic function to caculate how many ones are there
#ifdef WIN32
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
#else
#ifdef _WIN64
    unsigned int c; // c accumulates the total bits set in v
    for (c = 0; bits; c++) {
        bits &= bits - 1; // clear the least significant bit set
    }
    return (c);
    /*
    bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1)  + (bits & 0x5555555555555555);
    bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2)  + (bits & 0x3333333333333333);
    bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4)  + (bits & 0x0F0F0F0F0F0F0F0F);
    bits = ((bits & 0xFF00FF00FF00FF00) >> 8)  + (bits & 0x00FF00FF00FF00FF);
    bits = ((bits & 0xFFFF0000FFFF0000) >> 16) + (bits & 0x0000FFFF0000FFFF);
    bits = ((bits & 0xFFFFFFFF00000000) >> 32) + (bits & 0x00000000FFFFFFFF);
    return (unsigned int)(bits);
    */
#else
    return(__builtin_popcountll(bits));
#endif
#endif
}

unsigned int bitsStrCompare(CReadInBits r1, CReadInBits r2)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    return(bitsSetCount(bits));
}

// compare only the last N bases (bits)
unsigned int bitsStrNCompare(CReadInBits r1, CReadInBits r2, unsigned int N)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    bits <<= (wordSize - N);
    return(bitsSetCount(bits));
    /*
        // magic function to calculate how many ones are there
    #ifdef WIN32
        unsigned int c; // c accumulates the total bits set in v
        for (c = 0; bits; c++) {
            bits &= bits - 1; // clear the least significant bit set
        }
        return (c);
    #else
    #ifdef _WIN64
        unsigned int c; // c accumulates the total bits set in v
        for (c = 0; bits; c++) {
            bits &= bits - 1; // clear the least significant bit set
        }
        return (c);
        bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1)  + (bits & 0x5555555555555555);
        bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2)  + (bits & 0x3333333333333333);
        bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4)  + (bits & 0x0F0F0F0F0F0F0F0F);
        bits = ((bits & 0xFF00FF00FF00FF00) >> 8)  + (bits & 0x00FF00FF00FF00FF);
        bits = ((bits & 0xFFFF0000FFFF0000) >> 16) + (bits & 0x0000FFFF0000FFFF);
        bits = ((bits & 0xFFFFFFFF00000000) >> 32) + (bits & 0x00000000FFFFFFFF);
        return (unsigned int)(bits);

    #else
        return(__builtin_popcountll(bits));
    #endif
    #endif
    */
}

// skip the first M base and compare the following N base pairs
unsigned int bitsStrMNCompare(CReadInBits r1, CReadInBits r2, unsigned int M, unsigned int N)
{
    WORD_SIZE bits = (r1.UpperBits ^ r2.UpperBits) | (r1.LowerBits ^ r2.LowerBits);
    bits >>= M;
    bits <<= (wordSize - N);
    return(bitsSetCount(bits));
}

unsigned int encodeLongRead(const char* read, CReadInBits& firstHalf, CReadInBits& secondHalf)
{
    int readLength = (int)strlen(read);
    int secondHalfStart = readLength - CReadInBits::iReadLength;
    if (read != NULL && readLength > (int)MAX_READ_LENGTH) {
        secondHalf.encode(&read[secondHalfStart]);
        firstHalf.encode(read);
        return(0); // Potential error for buffer overflow
    } else {
        return(1);
    }
}

unsigned int decodeLongRead(CReadInBits& firstHalf, CReadInBits& secondHalf, char* read, bool oddReadLength)
{
    int secondHalfStart;
    if (oddReadLength) {
        secondHalfStart = CReadInBits::iReadLength - 1;
    } else {
        secondHalfStart = CReadInBits::iReadLength;
    }
    if (read != NULL) {
        firstHalf.decode(read);
        secondHalf.decode(&read[secondHalfStart]);
        read[secondHalfStart + CReadInBits::iReadLength] = '\0';
        return(0); // Potential error for buffer overflow
    } else {
        return(1);
    }
}