// File: ReadsFileParser.cpp
//
// package info (click to toggle)
// perm 0.4.0-8
// links: PTS, VCS
// area: main
// in suites: bookworm, forky, sid, trixie
// size: 976 kB
// sloc: cpp: 13,499; makefile: 98; sh: 12
// file content (532 lines) | stat: -rw-r--r-- 20,087 bytes
// parent folder | download | duplicates (5)
#include "ReadsFileParser.h"

// Tells whether c is one of the two placeholder symbols accepted in a read
// line in addition to nucleotides: '.' or 'N'.
inline bool isNorDot(char c)
{
    switch (c) {
    case '.':
    case 'N':
        return true;
    default:
        return false;
    }
}

// Default constructor: all member setup is delegated to initialization().
CReadsFileParser::CReadsFileParser(void)
{
    initialization();
}

// Destructor: closes the reads file and frees the input buffer that
// openAFileReady2GetRead() allocated.
CReadsFileParser::~CReadsFileParser(void)
{
    ifile.close();
    if (pBuf != NULL) {
        delete pBuf;
    }
    // pOBuf is intentionally not deleted here: it is set up and destroyed
    // outside this class.
}

int CReadsFileParser::initialization(void)
{
    this->pBuf = NULL;
    this->pOBuf = NULL;
    this->cFileType = 'F';
    myStrCpy(this->caNextRead,"\0", FILENAME_MAX);
    myStrCpy(this->caNextReadTag, "\0", FILENAME_MAX);
    myStrCpy(this->caNextReadQSs,"\0", FILENAME_MAX);
    myStrCpy(this->InputFile, "\0", FILENAME_MAX);
    this->uiRead_Length = 0;
    this->readStartIndex = 0; // The bases before the start index (5' end) will be removed.
    return(0);
}

/*
 * Open a reads file and prepare the parser for get_Next_Read().
 *
 *   filename        path of the reads file
 *   fileFormat      format hint, passed with filename to getReadsFileFormatSymbol()
 *   readStartIndex  number of leading (5' end) bases dropped from every read
 *   uiRead_Length   expected read length
 *   bDiscardReadsWN stored in bDiscardReadWN; not used inside this function
 *   pBadReadBuf     output buffer used by print_Next_Read(); it is created and
 *                   destroyed by the caller, this class only keeps the pointer
 *
 * Returns the detected format symbol. The same symbol is returned when the
 * file cannot be opened; in that case a message is logged and pBuf stays NULL,
 * so get_Next_Read() will only report a NULL buffer. Returns 'N' if the input
 * buffer could not be allocated.
 */
char CReadsFileParser::openAFileReady2GetRead\
(const char* filename, const char* fileFormat, unsigned int readStartIndex,
 unsigned int uiRead_Length, bool bDiscardReadsWN, FileOutputBuffer* pBadReadBuf)
{
    this->bDiscardReadWN = bDiscardReadsWN;
    this->readStartIndex = readStartIndex;
    this->uiRead_Length = uiRead_Length;
    this->cFileType = getReadsFileFormatSymbol(filename, fileFormat);
    myStrCpy(this->InputFile, filename, MAX_PATH);

    // Release whatever an earlier call left behind. Without this, a second
    // call would leak the old input buffer and open() would fail on a stream
    // that is still open (or keep stale eof/fail flags from the last file).
    if (this->ifile.is_open()) {
        this->ifile.close();
    }
    this->ifile.clear();
    if (this->pBuf != NULL) {
        delete this->pBuf;
        this->pBuf = NULL;
    }

    this->ifile.open(InputFile);
    // A failed open() sets failbit, not badbit, so bad() alone cannot detect
    // it; is_open() is the reliable test.
    if (!this->ifile.is_open() || this->ifile.bad() || !fileExist(InputFile)) {
        char tmpWorkDir[MAX_PATH];
        get_working_directory(tmpWorkDir);
        LOG_INFO("\nInfo %d: Cannot open reads file %s in %s\n",\
                 WARNING_LOG, InputFile, tmpWorkDir);
        return(this->cFileType);
    } else {
        this->pBuf = new FileInputBuffer(READS_INPUT_BUFFER_SIZE, &ifile);
        if (this->pBuf == NULL) {
            ERR; // Fail to new FileInputBuffer
            return('N');
        }
        // The output buffer is initialized outside the class
        this->pOBuf = pBadReadBuf;
    }
    return(this->cFileType);
}

/*
 * This function read in the next read from the inputfile (buffer) this->pBuf.
 * Currently it only read the sequence (no quality) in the following format.
 */
char* CReadsFileParser::get_Next_Read(void)
{
    // If there are still something to read from file or buffer
    // The kmer will be read to this->caNextRead and return it
    this->caNextRead[0] = '\0';
    if (this->pBuf == NULL) {
        LOG_INFO("\nInfo %d: Read File Buffer is NULL\n", WARNING_LOG);
    } else if (this->pBuf->ready2Read()) {
        switch (this->cFileType) {
        case 'F':
            get_Next_Read_From_Fasta();
            break;
        case 'S':
            get_Next_Read_From_csFasta();
            break;
        case 'q':
            get_Next_Read_From_Fastq();
            break;
        case 'Q':
            get_Next_Read_From_csFastq();
            break;
        default:
            get_Next_Read_From_Fasta();
        }
    }
    if (ifile.eof() == true) {
        ifile.close();
    }
    return(this->caNextRead);
}

void CReadsFileParser::print_Next_Read(void)
{
    if (this->pOBuf == NULL) {
        LOG_INFO("\nInfo %d: Read Output Buffer is NULL\n", WARNING_LOG);
    } else {
        unsigned int theReadLength = (unsigned int)strlen(this->caNextRead);
        switch (this->cFileType) {
        case 'q':
            trQScores(theReadLength, SolexaScoreEncodingShift, this->caNextReadQSs, this->caNextReadQSs);
            sprintf(this->pOBuf->caBufp, "@%s\n%s\n+\n%s\n", this->caNextReadTag, this->caNextRead, this->caNextReadQSs);
            this->pOBuf->fflush();
            break;
        case 'Q':
            trQScores(theReadLength, Phred_SCALE_QUAL_SHIFT, this->caNextReadQSs, this->caNextReadQSs);
            sprintf(this->pOBuf->caBufp, "@%s\n%s\n+\n%s\n", this->caNextReadTag, this->caNextRead, this->caNextReadQSs);
            this->pOBuf->fflush();
            break;
        case 'F':
        case 'S':
        default:
            sprintf(this->pOBuf->caBufp, ">%s\n%s\n", this->caNextReadTag, this->caNextRead);
            this->pOBuf->fflush();
        }
    }
}

/*
 * Reads the next sequence from a fasta input into this->caNextRead and
 * returns that buffer. A header token (starting with '>') met on the way is
 * stored in this->caNextReadTag. The returned string is empty when Getline()
 * reports no more data or when a line holds no token.
 * Only the first token of each line (split at space, comma, tab or newline)
 * is looked at.
 */
char* CReadsFileParser::get_Next_Read_From_Fasta(void)
{
    char caBuf[ MAX_CHAR_PER_LINE ];
    char* pch;

    while (1) { // Line which don't start with '>' or nucleotide symbol will be ignored.
        if (this->pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {
            this->caNextRead[0] = '\0'; // Getline() returned 0: treated as file end
            break;
        } else {
            pch = strtok(caBuf, " ,\t\n"); // First token of the line: a header or a sequence
            if (pch == NULL) {
                // The line has no token (empty or delimiters only); this is
                // handled exactly like file end.
                this->caNextRead[0] = '\0';
                break;
            } else if (pch[0] == '>' && !ifile.eof()) {
                // TODO fix the problem when a space after '>' by getReadIdFromTagLine()
                myStrCpy(this->caNextReadTag, pch, FILENAME_MAX);
                continue;
            } else if ( !isNucleotide(pch[0]) && !isNorDot(pch[0]) && !ifile.eof()) {
                continue;
            }
            // NOTE(review): both skip tests above are disabled once ifile.eof()
            // is true, so such lines then fall through and are parsed as
            // sequence - confirm this is intended.
            unsigned int line_length = (unsigned int)strlen(pch);
            if (line_length >= uiRead_Length) {
                // Long enough: drop the first readStartIndex bases and keep at
                // most uiRead_Length of the rest.
                // NOTE(review): there is no check that readStartIndex <= line_length.
                const char* readSeq = &(pch[this->readStartIndex]);
                strncpy(this->caNextRead, readSeq, uiRead_Length);
                this->caNextRead[uiRead_Length] = '\0';
                break;
            } else if (line_length < uiRead_Length) {
                // The read may continue on the next line: append that line
                // directly behind the token and measure again.
                // NOTE(review): the append offset comes from strlen(pch) but is
                // applied to caBuf; the two agree only when pch == caBuf, i.e.
                // the line had no leading delimiter - confirm.
                this->pBuf->Getline(&caBuf[line_length], MAX_CHAR_PER_LINE - line_length);
                line_length = (unsigned int)strlen(caBuf);
                if (line_length < uiRead_Length) {
                    LOG_INFO("Info %d: Read %s %s is %u bp < expected length %u bp.\n",\
                             WARNING_LOG, this->caNextReadTag, caBuf, line_length, uiRead_Length);
                    this->caNextRead[0] = '\0';
                    continue;
                } else if (line_length > uiRead_Length) {
                    // Joined line is longer than needed: keep the first
                    // uiRead_Length characters, capped at MAX_LINE - 1.
                    // NOTE(review): readStartIndex is not applied in this
                    // branch; MAX_LINE is presumably the capacity of
                    // caNextRead - TODO confirm.
                    int minUiReadLength = (int)uiRead_Length;
                    int iBufSize = MAX_LINE;
                    if ( minUiReadLength >= iBufSize) {
                        minUiReadLength = iBufSize - 1;
                    }
                    strncpy(this->caNextRead, caBuf, minUiReadLength);
                    this->caNextRead[minUiReadLength] = '\0';
                    break;
                } else {
                    // Joined line has exactly the expected length.
                    myStrCpy(this->caNextRead, pch, FILENAME_MAX);
                    break;
                }
            }
        }
    }
    return(this->caNextRead);
}

// Function-body macro: keeps reading lines from pBuf into caBuf and runs
// perLineAction after each line. The enclosing function returns false as soon
// as Getline() returns 0; perLineAction is expected to return from the
// enclosing function itself once it has found what it is looking for.
#define GET_LINE_UNTIL(pBuf, caBuf, perLineAction) {\
    for (;;) {\
        if (pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {\
            return(false);\
        }\
        perLineAction;\
    }\
    return(true);\
}

// Skips lines until a fastq name line (first character '@') is found and
// copies that line into caNextReadTag.
// Returns true when a name line was stored. Returns false when Getline()
// reports no more data, or when a line starting with '>' shows up (the input
// then looks like fasta, which is logged).
inline bool CReadsFileParser::getNextSeqNameInFq(FileInputBuffer* pBuf, char* caBuf)
{
    while (pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) != 0) {
        const char lead = caBuf[0];
        if (lead == '>') {
            LOG_INFO("Info %d: Invalid fastq file. Is it a fasta file?\n", WARNING_LOG);
            return false;
        }
        // A '@' line is only accepted while the file stream is not at eof.
        if (lead == '@' && !ifile.eof()) {
            myStrCpy(this->caNextReadTag, caBuf, FILENAME_MAX);
            return true;
        }
    }
    return false;
}
// Skips lines until one begins with exp1stChar; that line is left in caBuf.
// Returns true when such a line is found while the file stream is not at eof,
// false when Getline() reports no more data first.
inline bool CReadsFileParser::getNextLine(FileInputBuffer* pBuf, char* caBuf, const char exp1stChar)
{
    while (pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) != 0) {
        const bool startsAsExpected = (caBuf[0] == exp1stChar);
        if (startsAsExpected && !ifile.eof()) {
            return true;
        }
    }
    return false;
}

/*
 * Reads lines until one starts with a nucleotide, 'N' or '.', then copies it,
 * minus its first readStartIndex characters, into this->caNextRead.
 *
 *   pBuf       input buffer to read lines from
 *   caBuf      scratch line buffer; holds the last line read on return
 *   expLength  minimum number of characters the line must have
 *
 * Returns true when the line has at least expLength characters and is longer
 * than readStartIndex. Returns false when Getline() reports no more data, or
 * when the line is too short; in the latter case the (shorter) read is still
 * left in caNextRead so the caller can report it.
 */
inline bool CReadsFileParser::getNextSeqInFq
(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength)
{
    /*
     * The while loop provides some input error correction: it keeps reading
     * until a line that starts with a nucleotide or a dot.
     */
    while (1) {
        if(pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {
            return(false);
        }
        if ((isNucleotide(caBuf[0]) || isNorDot(caBuf[0])) && !ifile.eof()) {
            const unsigned int line_length = (unsigned int)strlen(caBuf);
            // TODO: double check when the readStartIndex > 0. (line_length - this->readStartIndex >= expLength ?)
            if ( line_length >= expLength && line_length > this->readStartIndex ) {
                unsigned int theReadLength = min(expLength, MAX_LONG_READ_LENGTH);
                myStrCpy(this->caNextRead, &caBuf[this->readStartIndex], MAX_LONG_READ_LENGTH + 1);
                this->caNextRead[theReadLength] = '\0';
                return(true);
            } else {
                if(this->pOBuf == NULL) {
                    // line_length is unsigned, so it is printed with %u.
                    LOG_INFO("Info %d: Read %s:%s has length %u < the expected %u bp\n",\
                             WARNING_LOG,this->caNextReadTag, caBuf, line_length, uiRead_Length);
                }
                // Keep the shorter read in buffer. The trim offset is clamped
                // to the line length: this branch is also reached when the line
                // is shorter than readStartIndex, and &caBuf[readStartIndex]
                // would then point past the string terminator.
                unsigned int trimStart = line_length;
                if (this->readStartIndex < line_length) {
                    trimStart = this->readStartIndex;
                }
                myStrCpy(this->caNextRead, &caBuf[trimStart], MAX_LONG_READ_LENGTH + 1);
                unsigned int theReadLength = min(line_length, MAX_LONG_READ_LENGTH);
                this->caNextRead[theReadLength] = '\0';
                return(false);
            }
        }
    }
}

// Reads exactly one line (the quality-score line) into caBuf.
// Returns true only when the line is longer than readStartIndex and has at
// least expLength characters; returns false otherwise, including when
// Getline() reports no more data.
inline bool CReadsFileParser::getNextQScoreInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength)
{
    if (pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {
        return false;
    }
    const unsigned int line_length = (unsigned int)strlen(caBuf);
    return (line_length > this->readStartIndex) && (line_length >= expLength);
}

char* CReadsFileParser::get_Next_Read_From_Fastq(void)
{
    char caBuf[ MAX_CHAR_PER_LINE ];
    bool bHasReadName = getNextSeqNameInFq(this->pBuf, caBuf);
    bool bHasLongReadSeq = getNextSeqInFq(this->pBuf, caBuf, this->uiRead_Length);
    bool bHas3rdLine = getNextLine(this->pBuf, caBuf,'+');
    bool bHasLongEnoughQualityScore = getNextQScoreInFq(this->pBuf, caBuf, this->uiRead_Length + this->readStartIndex);
    if ( bHasReadName && bHasLongReadSeq && bHas3rdLine && bHasLongEnoughQualityScore) {
        const char* qScoreBuf = &caBuf[this->readStartIndex];
        trQScores(this->uiRead_Length, -1 * SolexaScoreEncodingShift, qScoreBuf, this->caNextReadQSs);
        // sucessfully get a read
    } else if (!bHasLongReadSeq) { // Reads shorter than expected
        const char* qScoreBuf = &caBuf[this->readStartIndex];
        unsigned int shorterReadLength = (unsigned int)strlen(this->caNextRead);
        trQScores(shorterReadLength, -1 * SolexaScoreEncodingShift, qScoreBuf, this->caNextReadQSs);
        caNextRead[shorterReadLength] = '\0';
    } else if (!bHasLongEnoughQualityScore) { // Not long enough quality score
        for(unsigned int i = 0; i < this->uiRead_Length; i++)
            this->caNextReadQSs[i] = -1 * SolexaScoreEncodingShift; // use sudo score
    } else { // if EOF (file end)
        caNextRead[0] = '\0';
        ifile.close();
    }
    return(this->caNextRead);
}

/*
 * Reads one csfastq (color-space fastq) record and returns this->caNextRead.
 * The sequence line must hold uiRead_Length + 1 characters, one more than the
 * read length. The four line readers are always called, in order, whether or
 * not an earlier one failed. Quality scores are stored in caNextReadQSs after
 * trQScores() applies a shift of -Phred_SCALE_QUAL_SHIFT.
 */
char* CReadsFileParser::get_Next_Read_From_csFastq(void)
{
    char caBuf[ MAX_CHAR_PER_LINE ];
    bool bHasReadName = getNextSeqNameInFq(this->pBuf, caBuf);
    bool bHasLongReadSeq = getNextSeqInFq(this->pBuf, caBuf, this->uiRead_Length + 1);
    bool bHas3rdLine = getNextLine(this->pBuf, caBuf,'+');
    bool bHasLongEnoughQualityScore = getNextQScoreInFq(this->pBuf, caBuf, this->uiRead_Length + this->readStartIndex);
    if ( bHasReadName && bHasLongReadSeq && bHas3rdLine && bHasLongEnoughQualityScore) {
        const char* qScoreBuf = &caBuf[this->readStartIndex];
        trQScores(this->uiRead_Length, -1 * Phred_SCALE_QUAL_SHIFT, qScoreBuf, this->caNextReadQSs);
        // successfully got a read
    } else if (!bHasLongReadSeq) { // Reads shorter than expected
        // The short read kept in caNextRead was copied from behind the
        // readStartIndex prefix, so the scores must start at the same offset.
        // This matches the success branch above and the plain fastq reader.
        const char* qScoreBuf = &caBuf[this->readStartIndex];
        unsigned int shorterReadLength = (unsigned int)strlen(this->caNextRead);
        trQScores(shorterReadLength, -1 * Phred_SCALE_QUAL_SHIFT, qScoreBuf, this->caNextReadQSs);
        caNextRead[shorterReadLength] = '\0';
    } else if (!bHasLongEnoughQualityScore) { // Not long enough quality score
        for(unsigned int i = 0; i < this->uiRead_Length; i++)
            this->caNextReadQSs[i] = -1 * Phred_SCALE_QUAL_SHIFT; // use pseudo score
    } else { // name line or '+' line missing
        caNextRead[0] = '\0';
        ifile.close();
    }
    return(this->caNextRead);
}

// Splits a SOLiD read into its first character and the colors that follow it,
// and returns getBaseFromColors() for the requested position.
inline char getNtBaseFromSOLiDRead(const char* SOLiDRead, int position)
{
    const char leadingBase = SOLiDRead[0];
    return getBaseFromColors(leadingBase, SOLiDRead + 1, position);
}

char* CReadsFileParser::get_Next_Read_From_csFasta(void)
{
    // A polished version of et_Next_Read_From_fasta

    char caBuf[ MAX_CHAR_PER_LINE ];
    while (1) {
        caBuf[0] = '\0';
        //Note this->pBuf->Getline() will return 0 if EOF is meet O
        if (this->pBuf->Getline(caBuf, MAX_CHAR_PER_LINE - 1) == 0) {
            this->caNextRead[0] = '\0'; // It is file end
            ifile.close();// Must close the file
            break;
        } else {
            //If this line is header, new line, comment or null line however not EOF, read the next line
            if (caBuf[0] == '>' && !ifile.eof() ) {
                myStrCpy(this->caNextReadTag, caBuf, FILENAME_MAX);
                continue;
            } else if ( !isNucleotide(caBuf[0]) && !isNorDot(caBuf[0]) && !ifile.eof()) {
                continue;
            } else {
                unsigned int line_length = (unsigned int)strlen(caBuf);
                // Note the first base is from the primer not chromosome
                // The read length is counted as the # of colors 0, 1, 2, 3
                if (line_length == (uiRead_Length + 1)) {
                    myStrCpy(this->caNextRead, caBuf, MAX_LINE);
                    break;
                } else if (line_length > (uiRead_Length + 1)) {
                    char firstBase = getNtBaseFromSOLiDRead(caBuf, this->readStartIndex);
                    char* readsStartInBuf = &caBuf[this->readStartIndex];
                    *readsStartInBuf = firstBase; // Truncated the prefix for SOLiD read
                    strncpy(this->caNextRead, readsStartInBuf, uiRead_Length + 1);
                    this->caNextRead[uiRead_Length + 1] = '\0';
                    break;
                } else {
                    LOG_INFO("Info %d: Read %s %s is %u bp < expected length %u bp.\n",\
                             WARNING_LOG, this->caNextReadTag, caBuf, line_length, uiRead_Length);
                    this->caNextRead[0] = '\0';
                    continue;
                }
            }
        }
    }
    return(this->caNextRead);
}

/*
unsigned int estimateNoOfReads(const char* fileName, const char* fileFormat)
{
    char fileType = getReadsFileFormatSymbol(fileName, fileFormat);
    switch (fileType) {
    case 'F':
    case 'S':
        return(estimateNoOfReads_From_Fasta(fileName));
    case 'Q':
    case 'q':
        return(estimateNoOfReads_From_Fastq(fileName));
    default:
        break;
    }
    return (MAX_READ_SET_CAPACITY);
}

unsigned int estimateNoOfReads_From_Fasta(const char* fileName)
{
    // Assume each line has a header
    unsigned int uiNoOfRead = (unsigned int)getNumberOfLineInAFile(fileName) / 2;
    return(uiNoOfRead);
}

unsigned int estimateNoOfReads_From_Fastq(const char* fileName)
{
    // Assume each line has a header
    if (fileExist(fileName)) {
        unsigned int uiNoOfRead = (unsigned int)getNumberOfLineInAFile(fileName) / 4;
        return(uiNoOfRead);
    } else {
        return(0);
    }
}
*/

// Checks whether a format string names the fastq format, i.e. whether it
// begins with "fq", ".fq", "fastq" or ".fastq".
inline bool isFqFormatStr(const char* fileFormat)
{
    static const char* const fqPrefixes[] = { "fq", ".fq", "fastq", ".fastq" };
    const unsigned int prefixCount = sizeof(fqPrefixes) / sizeof(fqPrefixes[0]);
    for (unsigned int i = 0; i < prefixCount; ++i) {
        if (strncmp(fileFormat, fqPrefixes[i], strlen(fqPrefixes[i])) == 0) {
            return true;
        }
    }
    return false;
}

/*
 * Returns true for a color-space fastq input.
 *
 *   fileName    file name, or a bare format string such as "fastq"
 *   fileFormat  format hint (the one-argument call elsewhere in this file
 *               suggests the declaration gives it a default - confirm in the
 *               header)
 *
 * A csfastq/csfq extension on either argument decides immediately. A plain
 * fastq extension is judged by content: the line after the first '@' line
 * must start with a nucleotide followed by a color digit 0-3. A bare fastq
 * format string is never opened and counts as plain fastq (false).
 */
bool is_csFastq_format(const char* fileName, const char* fileFormat)
{
    if (hasCsfqExtName(fileName) || hasCsfqExtName(fileFormat)) {
        return(true);
    }
    bool returnValue = false;
    if (hasFqExtName(fileName)) {
        if (isFqFormatStr(fileName)) {
            ; // this is a format string setting. It should be fastq format.
        } else if ( fileExist(fileName) ) {
            ifstream ifile(fileName);
            char caBuf[MAX_LINE];
            caBuf[0] = '\0';
            // Loop on the stream state rather than on eof(): a failed read
            // (file could not be opened, or a line longer than the buffer)
            // sets failbit without eofbit, and a loop that waits for eof()
            // would then never end.
            while (ifile.getline(caBuf, MAX_LINE)) {
                if (caBuf[0] == '@') {
                    // Clear both inspected characters so a failed read cannot
                    // leave the '@' line behind to be judged.
                    caBuf[0] = '\0';
                    caBuf[1] = '\0';
                    ifile.getline(caBuf, MAX_LINE);
                    returnValue = isACGT(caBuf[0]) && is0123(caBuf[1]);
                    break;
                }
            }
            ifile.close();
        } else {
            string msg = "Can't decide the format based on the ext name.";
            LOG_INFO("\nInfo %d: %s doesn't exist.\n %s",\
                     ERROR_LOG, fileName, msg.c_str());
        }
    }
    return (returnValue);
}

// Determines the reads format symbol from a format string and a file name.
// The format string wins; the file name is consulted only when the format
// string yields 'N' (unknown). 'N' is returned, and logged, when neither of
// them is recognized.
char getReadsFileFormatSymbol(string InputFile, string fileFormat)
{
    const char symbolFromFormat = getReadsFileFormatSymbol(fileFormat.c_str());
    if (symbolFromFormat != 'N') {
        return symbolFromFormat;
    }
    const char symbolFromName = getReadsFileFormatSymbol(InputFile.c_str());
    if (symbolFromName == 'N') {
        LOG_INFO("Info %d: Unknown reads format.\n", WARNING_LOG);
    }
    return symbolFromName;
}

/*
 * C-string front end of getReadsFileFormatSymbol(string, string).
 *
 *   InputFile   name of the reads file
 *   fileFormat  format hint; may be empty or identical to InputFile, in which
 *               case only the file name is used
 *
 * A hint given without a leading '.' (e.g. "fastq") is turned into an
 * extension (".fastq") so that the extension-based detection recognizes it.
 * Returns the format symbol, or 'N' when the format is unknown.
 */
char getReadsFileFormatSymbol(const char* InputFile, const char* fileFormat)
{
    if (strcmp(InputFile, fileFormat) == 0 || fileFormat[0] == '\0') {
        string inputFileStr(InputFile);
        return(getReadsFileFormatSymbol(inputFileStr, inputFileStr));
    }

    string formatStr = string(getExtName(fileFormat));
    if (formatStr.empty()) {
        // getExtName() is defined elsewhere; should it return nothing for a
        // dot-less hint, fall back to the hint itself.
        formatStr = fileFormat;
    }
    if (formatStr[0] != '.') {
        // Prefix the FORMAT with '.', not the input file name: "." + InputFile
        // is never a known extension, so the hint was silently ignored.
        formatStr = string(".").append(formatStr);
    }
    return(getReadsFileFormatSymbol(string(InputFile), formatStr));
}

// Maps a file name (or extension string) to a reads format symbol:
//   'F' fasta   (.fasta, .fa, .mfa, .fna)
//   'S' csfasta (.csfasta, .csfa)
//   'Q' csfastq (decided by is_csFastq_format(); such files may carry a
//               .fastq or .fq extension, so this test comes before 'q')
//   'q' fastq   (.fastqsanger, .fastq, .fq)
//   'N' unknown
char getReadsFileFormatSymbol(const char* fileName)
{
    const bool looksLikeFasta = hasTheExtName(fileName, ".fasta") ||
                                hasTheExtName(fileName, ".fa") ||
                                hasTheExtName(fileName, ".mfa") ||
                                hasTheExtName(fileName, ".fna");
    if (looksLikeFasta) {
        return 'F';
    }

    const bool looksLikeCsFasta = hasTheExtName(fileName, ".csfasta") ||
                                  hasTheExtName(fileName, ".csfa");
    if (looksLikeCsFasta) {
        return 'S';
    }

    if (is_csFastq_format(fileName)) {
        return 'Q';
    }

    const bool looksLikeFastq = hasTheExtName(fileName, ".fastqsanger") ||
                                hasTheExtName(fileName, ".fastq") ||
                                hasTheExtName(fileName, ".fq");
    if (looksLikeFastq) {
        return 'q';
    }
    return 'N';
}

// True when the file's format symbol is one of the color-space formats:
// 'S' (csfasta) or 'Q' (csfastq).
bool is_colorspace_reads(const char* fileName)
{
    switch (getReadsFileFormatSymbol(fileName)) {
    case 'S':
    case 'Q':
        return true;
    default:
        return false;
    }
}

// Replaces the content of fileFormat with the canonical format name
// ("fasta", "fastq", "csfastq" or "csfasta") detected from fileName and the
// incoming fileFormat hint. fileFormat is left untouched, and a message is
// logged, when the format is unknown.
// The caller's buffer must have room for the longest name, "csfastq", plus
// its terminator.
void getReadsFileFormat(const char* fileName, char* fileFormat)
{
    const char formatSymbol = getReadsFileFormatSymbol(fileName, fileFormat);

    const char* formatName = NULL;
    if (formatSymbol == 'F') {
        formatName = "fasta";
    } else if (formatSymbol == 'q') {
        formatName = "fastq";
    } else if (formatSymbol == 'Q') {
        formatName = "csfastq";
    } else if (formatSymbol == 'S') {
        formatName = "csfasta";
    }

    if (formatName == NULL) {
        LOG_INFO("Info %d: Unknown read file format.\n", WARNING_LOG);
        return;
    }
    strcpy(fileFormat, formatName);
}