1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
|
#ifndef GFASEQGET_H
#define GFASEQGET_H
#include "GFastaIndex.h"
#define MAX_FASUBSEQ 0x20000000
//max 512MB sequence data held in memory at a time
class GSubSeq {
public:
uint sqstart; //1-based coord of subseq start on sequence
uint sqlen; //length of subseq loaded
char* sq; //actual subsequence data will be stored here
// (with end-of-line characters removed)
/*char* xseq; //the exposed pointer to the last requested subsequence start
off_t xstart; //the coordinate start for the last requested subseq
off_t xlen; //the last requested subseq len*/
GSubSeq() {
sqstart=0;
sqlen=0;
sq=NULL;
/* xseq=NULL;
xstart=0;
xlen=0;*/
}
void forget() { //forget about pointer data, so we can reuse it
sq=NULL;
sqstart=0;
sqlen=0;
}
~GSubSeq() {
GFREE(sq);
}
// genomic, 1-based coordinates:
void setup(uint sstart, int slen, int sovl=0, int qfrom=0, int qto=0, uint maxseqlen=0);
//check for overlap with previous window and realloc/extend appropriately
//returns offset from seq that corresponds to sstart
// the window will keep extending until MAX_FASUBSEQ is reached
};
//
class GFaSeqGet {
char* fname; //file name where the sequence resides
FILE* fh;
off_t fseqstart; //file offset where the sequence actually starts
uint seq_len; //total sequence length, if known (when created from GFastaIndex)
uint line_len; //length of each line of text
uint line_blen; //binary length of each line
// = line_len + number of EOL character(s)
GSubSeq* lastsub;
void initialParse(off_t fofs=0, bool checkall=true);
const char* loadsubseq(uint cstart, int& clen);
void finit(const char* fn, off_t fofs, bool validate);
public:
//GStr seqname; //current sequence name
char* seqname;
GFaSeqGet(): fname(NULL), fh(NULL), fseqstart(0), seq_len(0),
line_len(0), line_blen(0), lastsub(NULL), seqname(NULL) {
}
GFaSeqGet(const char* fn, off_t fofs, bool validate=false):fname(NULL), fh(NULL),
fseqstart(0), seq_len(0), line_len(0), line_blen(0),
lastsub(NULL), seqname(NULL) {
finit(fn,fofs,validate);
}
GFaSeqGet(const char* fn, bool validate=false):fname(NULL), fh(NULL),
fseqstart(0), seq_len(0), line_len(0), line_blen(0),
lastsub(NULL), seqname(NULL) {
finit(fn,0,validate);
}
GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen);
//constructor from GFastaIndex record
GFaSeqGet(FILE* f, off_t fofs=0, bool validate=false);
~GFaSeqGet() {
if (fname!=NULL) {
GFREE(fname);
fclose(fh);
}
GFREE(seqname);
delete lastsub;
}
const char* seq(uint cstart=1, int clen=0) {
int cend = clen==0 ? 0 : cstart+clen-1;
return getRange(cstart, cend);
}
const char* subseq(uint cstart, int& clen);
const char* getRange(uint cstart=1, uint cend=0) {
if (cend==0) cend=(seq_len>0)?seq_len : MAX_FASUBSEQ;
if (cstart>cend) { Gswap(cstart, cend); }
int clen=cend-cstart+1;
//int rdlen=clen;
return subseq(cstart, clen);
}
//caller is responsible for deallocating the return string
char* copyRange(uint cstart, uint cend, bool revCmpl=false, bool upCase=false);
//uncached, read and return allocated buffer
//caller is responsible for deallocating the return string
char* fetchSeq(int* retlen=NULL) {
int clen=(seq_len>0) ? seq_len : MAX_FASUBSEQ;
if (lastsub) { delete lastsub; lastsub=NULL; }
subseq(1, clen);
if (retlen) *retlen=clen;
char* r=lastsub->sq;
lastsub->forget();
if (clen>0) {
r[clen]=0;
}
else {
r=NULL;
}
return r;
}
void loadall(uint32 max_len=0) {
//TODO: better read the whole sequence differently here - line by line
//so when EOF or another '>' line is found, the reading stops!
int clen=(seq_len>0) ? seq_len : ((max_len>0) ? max_len : MAX_FASUBSEQ);
subseq(1, clen);
}
void load(uint cstart, uint cend) {
//cache as much as possible
if (seq_len>0 && cend>seq_len) cend=seq_len; //correct a bad request
int clen=cend-cstart+1;
subseq(cstart, clen);
}
int getsublen() { return lastsub!=NULL ? lastsub->sqlen : 0 ; }
int getseqlen() { return seq_len; } //known when loaded with GFastaIndex
off_t getseqofs() { return fseqstart; }
int getLineLen() { return line_len; }
int getLineBLen() { return line_blen; }
//reads a subsequence starting at genomic coordinate cstart (1-based)
};
//multi-fasta sequence handling
class GFastaDb {
public:
char* fastaPath;
GFastaIndex* faIdx; //could be a cdb .cidx file
//int last_fetchid;
const char* last_seqname;
GFaSeqGet* faseq;
//GCdbYank* gcdb;
GFastaDb(const char* fpath=NULL, bool forceIndexFile=true):fastaPath(NULL), faIdx(NULL), last_seqname(NULL),
faseq(NULL) {
//gcdb=NULL;
init(fpath, forceIndexFile);
}
void init(const char* fpath, bool writeIndexFile=true) {
if (fpath==NULL || fpath[0]==0) return;
//last_fetchid=-1;
last_seqname=NULL;
if (!fileExists(fpath))
GError("Error: file/directory %s does not exist!\n",fpath);
fastaPath=Gstrdup(fpath);
//GStr gseqpath(fpath);
if (fileExists(fastaPath)>1) { //exists and it's not a directory
char* fainame=Gstrdup(fastaPath,4);
int fainamelen=strlen(fainame);
//int fainame_len=strlen(fainame);
if (trimSuffix(fastaPath, ".fai")) {
//.fai index file given directly
if (!fileExists(fastaPath))
GError("Error: cannot find fasta file for index %s !\n", fastaPath);
}
else { //append .fai as needed
strcpy(fainame+fainamelen, ".fai");
fainamelen+=4;
}
//GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
faIdx=new GFastaIndex(fastaPath, fainame);
char* fainamecwd=fainame; //will hold just the file name without the path
char* plast=strrchr(fainamecwd, '/'); //CHPATHSEP
if (plast!=NULL) {
fainamecwd=plast+1; //point to the file name only
}
if (!faIdx->hasIndex()) { //could not load index file .fai
//try current directory (Warning: might not be the correct index for that file!)
if (plast==NULL) {
if (fileExists(fainamecwd)>1) {
faIdx->loadIndex(fainamecwd);
}
}
} //tried to load index
if (!faIdx->hasIndex()) { //no index file to be loaded, build the index
//if (forceIndexFile)
// GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
faIdx->buildIndex(); //build index in memory only
if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
if (writeIndexFile) {
//GMessage("Fasta index rebuilt.\n");
FILE* fcreate=fopen(fainame, "w");
char* idxfname=fainame;
if (fcreate==NULL) {
GMessage("Warning: cannot create fasta index file %s! (permissions?)\n", fainame);
if (fainame!=fainamecwd) {
//try cwd
idxfname=fainamecwd;
GMessage(" Attempting to create the index in the current directory..\n");
if ((fcreate=fopen(fainamecwd, "w"))==NULL)
GError("Error: cannot create fasta index file %s!\n", fainamecwd);
}
}
if (fcreate!=NULL) {
if (faIdx->storeIndex(fcreate)<faIdx->getCount())
GMessage("Warning: error writing the index file %s!\n", idxfname);
else GMessage("FASTA index file %s created.\n", idxfname);
}
} //file storage of index requested
} //creating FASTA index
GFREE(fainame);
} //multi-fasta file
}
GFaSeqGet* fetchFirst(const char* fname, bool checkFasta=false) {
faseq=new GFaSeqGet(fname, checkFasta);
faseq->loadall();
//last_fetchid=gseq_id;
GFREE(last_seqname);
last_seqname=Gstrdup(faseq->seqname);
return faseq;
}
char* getFastaFile(const char* gseqname) {
if (fastaPath==NULL) return NULL;
int gnl=strlen(gseqname);
char* s=Gstrdup(fastaPath, gnl+8);
int slen=strlen(s);
if (s[slen-1]!='/') {//CHPATHSEP ?
s[slen]='/';
slen++;
s[slen]='\0';
}
//s.append(gseqname);
strcpy(s+slen, gseqname);
slen+=gnl;
if (!fileExists(s)) {
//s.append(".fa")
strcpy(s+slen, ".fa");
slen+=3;
}
if (!fileExists(s)) { strcpy(s+slen, "sta"); slen+=3; }
if (fileExists(s)) return Gstrdup(s);
else {
GMessage("Warning: cannot find genomic sequence file %s/%s{.fa,.fasta}\n",fastaPath, s);
return NULL;
}
GFREE(s);
}
GFaSeqGet* fetch(const char* gseqname) {
if (fastaPath==NULL) return NULL;
if (last_seqname!=NULL && (strcmp(gseqname, last_seqname)==0)
&& faseq!=NULL) return faseq;
delete faseq;
faseq=NULL;
//last_fetchid=-1;
GFREE(last_seqname);
last_seqname=NULL;
//char* gseqname=GffObj::names->gseqs.getName(gseq_id);
if (faIdx!=NULL) { //fastaPath was the multi-fasta file name and it must have an index
GFastaRec* farec=faIdx->getRecord(gseqname);
if (farec!=NULL) {
faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
farec->line_len, farec->line_blen);
faseq->loadall(); //just cache the whole sequence, it's faster
//last_fetchid=gseq_id;
last_seqname=Gstrdup(gseqname);
}
else {
GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
return NULL;
}
}
else { //directory with FASTA files named as gseqname
char* sfile=getFastaFile(gseqname);
if (sfile!=NULL) {
faseq=new GFaSeqGet(sfile);
faseq->loadall();
//last_fetchid=gseq_id;
GFREE(sfile);
}
} //one fasta file per contig
//else GMessage("Warning: fasta index not available, cannot retrieve sequence %s\n",
// gseqname);
return faseq;
}
~GFastaDb() {
GFREE(fastaPath);
GFREE(last_seqname);
//delete gcdb;
delete faIdx;
delete faseq;
}
};
GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, const char* seqid);
#endif
|