1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
|
#include<algorithm>
#include<fstream>
#include<limits>
#include<set>
#include<sstream>
#include "TranscriptSequence.h"
#include "misc.h"
#include "common.h"
// Number of times we randomly probe for old cache record.
// CR: #define WORST_SEARCH_N 10
// Create an empty object: no transcripts (M = 0), cM = 0 and no
// gene or transcript names recorded.
TranscriptSequence::TranscriptSequence(){//{{{
   // CR: srand(time(NULL));
   M=0;
   cM=0;
   gotGeneNames=false;
   // gotTrNames is otherwise only assigned inside readSequence(); give it a
   // defined value so it is never read uninitialised.
   gotTrNames=false;
   // CR: useCounter = 0;
}//}}}
// Create the object and immediately read the reference file fileName,
// interpreting description lines according to format.
// The result of readSequence() is not reported by the constructor.
TranscriptSequence::TranscriptSequence(string fileName, refFormatT format){//{{{
   // Initialise the members directly. A statement `TranscriptSequence();`
   // here would only construct and discard a temporary object, leaving this
   // object's members uninitialised if readSequence() fails early.
   M=0;
   cM=0;
   gotGeneNames=false;
   gotTrNames=false;
   readSequence(fileName,format);
}//}}}
// Open the reference file fileName, record the stream position of every
// record (a line starting with '>') and then load all sequences via
// loadSequence().
//
// Names are extracted from the description lines as long as every line seen
// so far provided them:
//  - format == GENCODE: the line is split on '|'; token 0 without its first
//    character is stored as transcript name, token 1 as gene name.
//  - otherwise: the whitespace-delimited word following the first "gene:" or
//    "gene=" is stored as gene name; transcript names are not recorded.
// Once a description line lacks the expected fields, gotGeneNames (and for
// GENCODE also gotTrNames) is switched off for the rest of the file.
//
// Returns false if the file cannot be opened or the stream goes bad while
// reading; otherwise returns the result of loadSequence().
bool TranscriptSequence::readSequence(string fileName, refFormatT format){//{{{
   fastaF.open(fileName.c_str());
   if(!fastaF.is_open()){
      error("TranscriptSequence: problem reading transcript file.\n");
      return false;
   }
   trSeqInfoT newTr;
   // CR: newTr.lastUse=0;
   // CR: newTr.cache=-1;
   string trDesc,geneName;
   long pos;
   istringstream geneDesc;
   trNames.clear();
   geneNames.clear();
   gotGeneNames = true;
   // Record trNames only from gencode ref.
   gotTrNames = (format == GENCODE);
   while(fastaF.good()){
      // Skip complete lines until the next header line. The skip is not
      // limited to a fixed number of characters, so a very long line can
      // never be split in the middle and have a '>' inside it mistaken for
      // the start of a header.
      while((fastaF.peek()!='>')&&(fastaF.good()))
         fastaF.ignore(std::numeric_limits<std::streamsize>::max(),'\n');
      if(! fastaF.good())break;
      // Read description line:
      getline(fastaF, trDesc, '\n');
      // look for gene name if previous lines had gene name:
      if(gotGeneNames){
         if(format == GENCODE){
            vector<string> lineTokens = ns_misc::tokenize(trDesc,"|");
            if(lineTokens.size()>1){
               geneNames.push_back(lineTokens[1]);
               // Drop the leading '>' of the header line.
               trNames.push_back(lineTokens[0].substr(1));
            }else{
               gotGeneNames = false;
               gotTrNames = false;
            }
         }else{ // format == STANDARD
            // find() yields npos (the largest value) when the tag is missing,
            // so min() selects whichever tag occurs first, or npos if neither.
            pos=min(trDesc.find("gene:"),trDesc.find("gene="));
            if(pos!=(long)string::npos){
               geneDesc.clear();
               // Skip the 5 characters of the tag itself.
               geneDesc.str(trDesc.substr(pos+5));
               geneDesc >> geneName;
               geneNames.push_back(geneName);
            }else{
               gotGeneNames = false;
            }
         }
      }
      // remember position (first character after the description line):
      newTr.seek=fastaF.tellg();
      trs.push_back(newTr);
   }
   // Exit if there was an error while reading the file.
   if(fastaF.bad()){
      error("TranscriptSequence: problem reading file.\n");
      return false;
   }
   M = trs.size();
   // Allocate cache for all.
   cache.resize(M);
   //cache.resize(min(M,(long)TRS_CACHE_MAX));
   //cachedTrs.resize(min(M,(long)TRS_CACHE_MAX));
   // Clear eof flag from input stream.
   fastaF.clear();
   return loadSequence();
}//}}}
// Read the sequence of every indexed transcript from fastaF into cache.
// For transcript tr the stream is positioned at trs[tr].seek and lines are
// concatenated (without the newline characters) until the next header line
// ('>') or the end of the file.
// Returns false if the stream goes bad while reading, true otherwise.
bool TranscriptSequence::loadSequence(){//{{{
   cache.resize(M);
   string seqLine;
   for(long tr=0;tr<M;tr++){
      // Set input stream to transcript's position.
      fastaF.seekg(trs[tr].seek);
      // Read line by line until reaching EOF or next header line '>'.
      // The loop condition checks fail() and not good(): when the last line
      // of the file has no trailing newline, getline() still extracts it but
      // sets eofbit, and testing good() would silently drop that line.
      while((fastaF.peek()!='>')&&(!getline(fastaF,seqLine,'\n').fail())){
         cache[tr]+=seqLine;
      }
      if(fastaF.bad()){
         error("TranscriptSequence: Failed reading transcript %ld\n",tr);
         return false;
      }
      // Clear eof/fail flags so that the next seekg() works.
      fastaF.clear();
   }
   return true;
}//}}}
long TranscriptSequence::getG() const{//{{{
if(!gotGeneNames)return 0;
return (set<string>(geneNames.begin(),geneNames.end())).size();
}//}}}
// Give read-only access to the full sequence of transcript tr.
// For an index outside [0, M) a reference to noneTr is returned instead.
const string &TranscriptSequence::getTr(long tr) const{//{{{
   const bool validIndex = (tr>=0)&&(tr<M);
   if(validIndex){
      // Reference to the sequence stored in cache.
      return cache[tr];
   }
   return noneTr;
   /* Used with caching. {{{
   // Update last use info.
   trs[tr].lastUse = useCounter++;
   return cache[acquireSequence(tr)];
   }}} */
}//}}}
// Return l bases of transcript trI starting at 0-based position start.
//
//  trI       - transcript index; an index outside [0, M) yields "".
//  start     - first position; may be negative or beyond the sequence end.
//  l         - requested length; l <= 0 yields "".
//  doReverse - when true, the extracted string is reversed and A/T and C/G
//              are swapped (lower-case a/c/g/t give the upper-case
//              complement, any other character is left unchanged).
//
// Every requested position that lies outside the stored sequence is reported
// as 'N', so for l > 0 and a known transcript the result always has exactly
// l characters.
string TranscriptSequence::getSeq(long trI,long start,long l,bool doReverse) const{//{{{
   // Return empty string for unknown transcript.
   if((trI<0)||(trI>=M))return "";
   // Nothing to extract for a non-positive length.
   if(l<=0)return "";
   /* Used with caching. {{{
   // Update last use info.
   trs[tr].lastUse = useCounter++;
   // Get position within cache.
   long trI = acquireSequence(tr);
   }}} */
   const string &seq = cache[trI];
   const long seqLen = (long)seq.size();
   // Start from all Ns and overwrite only the part overlapping the sequence;
   // this keeps the result length equal to l in every case.
   string ret(l,'N');
   // Number of leading Ns when start lies in front of the sequence.
   // Tested as start <= -l so that start itself is never negated
   // unless the result is known to be smaller than l.
   long leftPad = 0;
   if(start<0) leftPad = (start <= -l) ? l : -start;
   // First position of the sequence that is actually copied.
   const long from = (start<0) ? 0 : start;
   if((leftPad<l)&&(from<seqLen)){
      const long copyLen = min(l-leftPad, seqLen-from);
      ret.replace(leftPad, copyLen, seq, from, copyLen);
   }
   if(doReverse){
      // For reverse return reversed string with complemented bases.
      reverse(ret.begin(),ret.end());
      for(string::size_type i=0;i<ret.size();i++){
         switch(ret[i]){
            case 'A': case 'a': ret[i]='T'; break;
            case 'T': case 't': ret[i]='A'; break;
            case 'C': case 'c': ret[i]='G'; break;
            case 'G': case 'g': ret[i]='C'; break;
            default: break;
         }
      }
   }
   return ret;
}//}}}
/* long TranscriptSequence::acquireSequence(long tr){//{{{
// If the sequence is stored in cache then just return its cache index.
if(trs[tr].cache!=-1)return trs[tr].cache;
long i,newP,j;
// See if cache is full.
if(cM<TRS_CACHE_MAX){
// If cache limit not reached, just add new sequence.
newP=cM;
cM++;
}else{
// If cache is full, look at WORST_SEARCH_N positions and choose the one least used.
newP=rand()%cM;
for(i=0;i<WORST_SEARCH_N;i++){
j=rand()%cM;
if(trs[cachedTrs[newP]].lastUse > trs[cachedTrs[j]].lastUse)newP=j;
}
// "remove" the transcript from position newP from cache.
trs[cachedTrs[newP]].cache=-1;
cache[newP].clear();
}
// Set input stream to transcript's position.
fastaF.seekg(trs[tr].seek);
string seqLine;
// Read line by line until reaching EOF or next header line '>'.
while((fastaF.peek()!='>')&&( getline(fastaF,seqLine,'\n').good())){
cache[newP]+=seqLine;
}
if(fastaF.bad()){
error("TranscriptSequence: Failed reading transcript %ld\n",tr);
return 0;
}
// Clear flags.
fastaF.clear();
// Update cache information.
cachedTrs[newP]=tr;
trs[tr].cache=newP;
// Return transcripts index within cache.
return newP;
}//}}} */
|