1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
|
#pragma once
#ifndef READS_FILE_PARSER_H_
#define READS_FILE_PARSER_H_
// Size limits shared by the reads-file parsing code.
// MAX_PATH is a macro and is only defined here when no earlier header defined it.
// It sizes CReadsFileParser::InputFile.
#ifndef MAX_PATH
#define MAX_PATH 2048
#endif
// Capacity, in chars, of the per-read line buffers (caNextReadTag, caNextRead, caNextReadQSs).
// NOTE(review): the #ifndef tests for a *macro* named MAX_CHAR_PER_LINE, but the
// definition below is a const int, not a macro. The guard therefore only skips the
// definition when a macro of that name already exists; it does not protect against
// another header defining the same const int.
#ifndef MAX_CHAR_PER_LINE
const int MAX_CHAR_PER_LINE = 5000;
#endif
// Not referenced in this header; presumably the size of the file input buffer - TODO confirm at the point of use.
const int READS_INPUT_BUFFER_SIZE = 20000000;
// Same macro-vs-const guard caveat as MAX_CHAR_PER_LINE above.
// Not referenced in this header; presumably an upper bound on the number of reads held in one read set - TODO confirm.
#ifndef MAX_READ_SET_CAPACITY
const int MAX_READ_SET_CAPACITY = 30000000;
#endif
#include "ReadsQualScores.h"
#include "Filename.h"
#include "FileInputBuffer.h"
#include "FileOutputBuffer.h"
#include "ShortReadUtil.h"
#include "chdir.h"
#include <iostream>
#include <string>
using namespace std;
/*
* This class provides functions to read the next read from reads files in
* different formats. The protected get_Next_Read_From_* methods cover the
* fasta, csfasta, fastq and csfastq formats.
* Only declarations are visible in this header; notes marked "presumably" or
* "TODO confirm" are inferred from names and must be checked against the definitions.
*/
class CReadsFileParser
{
public:
CReadsFileParser(void);
// Virtual destructor: the class has virtual methods and may be deleted through a base pointer.
virtual ~CReadsFileParser(void);
// Prepares a reads file so that reads can then be fetched (per the function name).
// filename, fileFormat: the file and its format string.
// readStartIndex, uiRead_Length, bDiscardReadsWN: named like the public members
// below; presumably copied into them - TODO confirm.
// pBadReadBuf: optional output buffer, defaults to NULL; presumably receives
// rejected reads - TODO confirm.
// The meaning of the returned char is not visible in this header.
// NOTE(review): the trailing backslash on the first line of the declaration is a
// line continuation; do not insert anything between that line and the next.
char openAFileReady2GetRead(const char* filename, const char* fileFormat,\
unsigned int readStartIndex, unsigned int uiRead_Length,
bool bDiscardReadsWN, FileOutputBuffer* pBadReadBuf = NULL);
// Read the next short read from the opened file, whatever its format.
// Returns a char*; presumably points at caNextRead, and the end-of-input value
// is not visible here - TODO confirm both.
virtual char* get_Next_Read(void);
// Presumably prints the read that get_Next_Read would return - TODO confirm.
virtual void print_Next_Read(void);
// Input file stream member. ifstream is used unqualified, which relies on the
// file-level "using namespace std;".
ifstream ifile;
// Raw pointers to the input and output buffers. Ownership (who allocates and
// deletes them) is not visible from this header - TODO confirm.
FileInputBuffer* pBuf;
FileOutputBuffer* pOBuf;
// Fixed-size buffer of MAX_PATH chars; presumably the path of the opened reads file.
char InputFile[MAX_PATH];
// Fixed-size buffers of MAX_CHAR_PER_LINE chars each. By name they hold the tag
// line, the sequence, and the quality scores of the next read.
char caNextReadTag[MAX_CHAR_PER_LINE];
char caNextRead[MAX_CHAR_PER_LINE];
char caNextReadQSs[MAX_CHAR_PER_LINE];
// One-character file type code; presumably the value produced by
// getReadsFileFormatSymbol - TODO confirm.
char cFileType;
// Presumably true when reads containing 'N' are to be discarded - TODO confirm.
bool bDiscardReadWN;
// Presumably the offset within each line where the read starts, and the expected
// read length - TODO confirm units and whether the index is 0- or 1-based.
unsigned int readStartIndex;
unsigned int uiRead_Length;
protected:
// get the universal read Id and store in vector
// inline void save_next_read_id(const char* tagLine);
// get a short read from fasta format
char* get_Next_Read_From_Fasta(void);
// get a short read from csfasta format
char* get_Next_Read_From_csFasta(void);
// get a short read from fastq format for Illumina read (seq only)
char* get_Next_Read_From_Fastq(void);
// get a short read from fastq format for SOLid read (seq only)
char* get_Next_Read_From_csFastq(void);
private:
// The return value's meaning is not visible in this header.
int initialization(void);
// sub functions for get_Next_Read_From_(cs)Fastq
// These are declared inline but not defined in this header, so their definitions
// must be visible in every translation unit that calls them.
// Each takes the input buffer to read from and a caller-supplied char buffer
// (caBuf); presumably caBuf receives the parsed text and the bool reports
// success - TODO confirm.
inline bool getNextSeqNameInFq(FileInputBuffer* pBuf, char* caBuf);
// expLength: presumably the expected number of characters in the sequence.
inline bool getNextSeqInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength);
// exp1stChar: presumably the character the line is expected to start with.
inline bool getNextLine(FileInputBuffer* pBuf, char* caBuf, const char exp1stChar);
// expLength: presumably the expected number of quality-score characters.
inline bool getNextQScoreInFq(FileInputBuffer* pBuf, char* caBuf, unsigned int expLength);
};
/*
//estimate number of read
unsigned int estimateNoOfReads(const char* fileName, const char* fileFormat);
unsigned int estimateNoOfReads_From_Fasta(const char* fileName);
unsigned int estimateNoOfReads_From_Fastq(const char* fileName);
*/
// Free helper functions for identifying the format of a reads file.
// Only the declarations are visible here; the behaviour notes are inferred from
// names and signatures and should be confirmed against the definitions.

// Presumably determines the format of fileName and writes it into the
// caller-supplied, writable fileFormat buffer. The required buffer size is not
// visible here - TODO confirm.
void getReadsFileFormat(const char* fileName, char* fileFormat);
// Return a one-character format symbol for InputFile. The two-argument overload
// also takes an explicit format string; the one-argument overload presumably
// derives the format from the file itself - TODO confirm.
char getReadsFileFormatSymbol(const char* InputFile, const char* fileFormat);
char getReadsFileFormatSymbol(const char* InputFile);
// Presumably true when the file is in csfastq format. fileFormat defaults to an
// empty string when the caller does not supply one.
bool is_csFastq_format(const char* fileName, const char* fileFormat = "");
// Presumably true when the file holds color-space reads - TODO confirm how this is detected.
bool is_colorspace_reads(const char* fileName);
// Returns true when hasTheExtName reports that fileName carries either the
// ".csfastq" or the ".csfq" extension, and false otherwise.
inline bool hasCsfqExtName(const char* fileName)
{
    // ".csfq" is only checked when ".csfastq" did not match.
    const bool bCsfqFormat = hasTheExtName(fileName, ".csfastq") ||
                             hasTheExtName(fileName, ".csfq");
    return bCsfqFormat;
}
// Returns true when hasTheExtName reports that fileName carries one of the
// ".fq", ".fastq" or ".fastqsanger" extensions, and false otherwise.
inline bool hasFqExtName(const char* fileName)
{
    // Extensions are tried in this order; the first match ends the search.
    if (hasTheExtName(fileName, ".fq")) {
        return true;
    }
    if (hasTheExtName(fileName, ".fastq")) {
        return true;
    }
    if (hasTheExtName(fileName, ".fastqsanger")) {
        return true;
    }
    return false;
}
#endif
|