File: TranscriptInfo.cpp

package info (click to toggle)
bitseq 0.7.5%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 636 kB
  • sloc: cpp: 7,043; python: 562; makefile: 150; sh: 52
file content (258 lines) | stat: -rw-r--r-- 8,428 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#include<fstream>
#include<set>

#include"TranscriptInfo.h"

#include "common.h"

bool TranscriptInfo::writeInfo(string fileName, bool force) const{//{{{
   ofstream trF;
   if(! force){
      // Do nothing if file exists.
      ifstream testF(fileName.c_str());
      if(testF.is_open()){
         testF.close();
         return false;
      }
      testF.close();
   }
   trF.open(fileName.c_str(),ios::out | ios::trunc);
   if(! trF.is_open() ) return false;
   trF<<"# M "<<M<<endl;
   for(long i=0;i<M;i++)
      trF<<transcripts[i].g<<" "<<transcripts[i].t<<" "<<transcripts[i].l<<" "<<transcripts[i].effL<<endl;
   trF.close();
   return true;
}//}}}
bool TranscriptInfo::writeGeneInfo(string fileName) const{//{{{
   ofstream geF;
   geF.open(fileName.c_str(),ios::out | ios::trunc);
   if(! geF.is_open() ) return false;
   geF<<"# G "<<G<<endl;
   geF<<"# <gene name> <# of transcripts> <average length>"<<endl;
   double length;
   for(long i=0;i<G;i++){
      length = 0;
      for(long j=0;j<genes[i].m;j++)length+=transcripts[genes[i].trs[j]].l;
      geF<<genes[i].name<<" "<<genes[i].m<<" "<<length/genes[i].m<<endl;
   }
   geF.close();
   return true;
}//}}}
bool TranscriptInfo::setInfo(vector<string> gNames,vector<string> tNames, vector<long> lengths){//{{{
   // The sizes have to be equal.
   if((gNames.size()!=tNames.size())||(tNames.size()!=lengths.size())) return false;
   transcriptT newT;
   M = (long) gNames.size();
   // Create new entry for each transcript.
   for(long i=0;i<M;i++){
      newT.g=gNames[i];
      newT.t=tNames[i];
      newT.gI = 0;
      newT.l=(int_least32_t)lengths[i];
      newT.effL = lengths[i];
      transcripts.push_back(newT);
   }
   // Initialize gene info based on gene names.
   setGeneInfo();
   isInitialized = true;
   return isInitialized;
}//}}}
void TranscriptInfo::setGeneInfo(){//{{{
   // Cleanup previous gene list.
   genes.clear();
   // Map of genes: name -> position within gene vector.
   map<string,long> names;
   geneT tmpG;
   long gi=0,i;
   groupedByGenes = true;
   string previousName = "!-noname-!";
   for(i=0;i<M;i++){
      // If gene name same as previous, then just add new transcript.
      if(transcripts[i].g == previousName){
         transcripts[i].gI = gi;
         genes[gi].m++;
         genes[gi].trs.push_back(i);
      }else{
         previousName=transcripts[i].g;
         // Check whether the gene name is new or was seen before.
         if(names.count(transcripts[i].g) == 0){
            // Prepare entry for new gene, starting with one (current) transcript.
            tmpG.name = transcripts[i].g;
            tmpG.m = 1;
            tmpG.trs = vector<long>(1,i);
            // Add entry to the gene list.
            genes.push_back(tmpG);
            // Set current gene index.
            gi=genes.size()-1;
            transcripts[i].gI = gi;
            // Map gene name to it's index and update previousName.
            names[transcripts[i].g] = gi;
         }else{
            // If gene name was seen before then transcripts are not grouped by genes.
            groupedByGenes=false;
            //warning("TranscriptInfo: Transcripts of gene %ld are not grouped.\n",transcripts[i].g);
            gi = names[transcripts[i].g];
            transcripts[i].gI = gi;
            genes[gi].m++;
            genes[gi].trs.push_back(i);
         }
      }
   }
   G = genes.size();
   // Add empty record to the end.
   tmpG.name = "";
   tmpG.m = 0;
   tmpG.trs.clear();
   genes.push_back(tmpG);
}//}}}
TranscriptInfo::TranscriptInfo(){ clearTranscriptInfo(); }
void TranscriptInfo::clearTranscriptInfo(){//{{{
   M=G=0;
   isInitialized=false;
   groupedByGenes=true;
   transcripts.clear();
   genes.clear();
}//}}}
TranscriptInfo::TranscriptInfo(string fileName){//{{{
   noName="wrongID";
   // TranscriptInfo();
   readInfo(fileName);
}//}}}
bool TranscriptInfo::readInfo(string fileName){//{{{
   clearTranscriptInfo();
   ifstream trFile(fileName.c_str());
   if(!trFile.is_open()){
      error("TranscriptInfo: problem reading transcript file.\n");
      return false;
   }
   transcriptT newT;
   // Read all lines of file ignoring lines starting with #.
   while(trFile.good()){
      while(trFile.good() && (trFile.peek()=='#'))
         trFile.ignore(100000000,'\n');
      if(!trFile.good()) break;
      // Read gene name, tr name and length.
      trFile>>newT.g>>newT.t>>newT.l;
      newT.gI = 0;
      // Should not hit EOF or any other error yet.
      if(!trFile.good()) break;
      // Read effective length if present:
      while((trFile.peek() == '\t')||(trFile.peek() == ' ')) trFile.get();
      // If end of line is reached then use length as effective length.
      if((trFile.good()) && (trFile.peek() == '\n')) newT.effL = newT.l;
      else trFile>>newT.effL;
      // If the line was OK, then push new entry (EOF when looking for effective length is allowed).
      if(!trFile.fail())
         transcripts.push_back(newT);
      // Ignore rest of the line.
      trFile.ignore(100000000,'\n');
   }
   trFile.close();
   isInitialized = true;
   M = (long)transcripts.size();
   setGeneInfo();
   return isInitialized;
}//}}}
long TranscriptInfo::getM() const{//{{{
   return M;
}//}}}
long TranscriptInfo::getG() const{//{{{
   return G;
}//}}}
const vector<long> &TranscriptInfo::getGtrs(long i) const{//{{{
   if((i>G) || (i<0)){
      // Return empty record.
      return genes[G].trs;
   }
   return genes[i].trs;
}//}}}
double TranscriptInfo::effL(long i) const{//{{{
   if(isInitialized && (i<M))return transcripts[i].effL;
   return 0;
}//}}}
long TranscriptInfo::L(long i) const{//{{{
   if(isInitialized && (i<M))return transcripts[i].l;
   return 0;
}//}}}
const string &TranscriptInfo::trName(long i) const{//{{{
   if(isInitialized && (i<M))return transcripts[i].t;
   return noName;
}//}}}
const string &TranscriptInfo::geName(long i) const{//{{{
   if(isInitialized && (i<M))return transcripts[i].g;
   return noName;
}//}}}
long TranscriptInfo::geId(long i) const{//{{{
   if(isInitialized && (i<M))return transcripts[i].gI;
   return -1;
}//}}}
void TranscriptInfo::setEffectiveLength(vector<double> effL){//{{{
   if((long)effL.size() != M){
      warning("TranscriptInfo: Wrong array size for effective length adjustment.\n");
      return;
   }
   // Adjust effective length to similar scale as normal length
   double sumL = 0,sumN = 0,norm;
   for(long i=0;i<M;i++){
      sumN+=effL[i];
      sumL+=transcripts[i].l;
   }
// don't normalize
//   norm = sumL / sumN;
   norm = 1;
   for(long i=0;i<M;i++){
      transcripts[i].effL = effL[i] * norm;
   }
}//}}}
vector<double> *TranscriptInfo::getShiftedLengths(bool effective) const{//{{{
   vector<double> *Ls = new vector<double>(M+1);
   for(long i=0;i<M;i++){
      if(effective)(*Ls)[i+1] = transcripts[i].effL;
      else (*Ls)[i+1] = transcripts[i].l;
   }
   return Ls;
}//}}}
bool TranscriptInfo::updateTrNames(const vector<string> &trList){//{{{
   if((long)trList.size() != M)return false;
   // Check uniqueness of new names.
   set<string> trSet(trList.begin(),trList.end());
   if((long)trSet.size() != M)return false;
   for(long i=0;i<M;i++){
      transcripts[i].t = trList[i];
   }
   return true;
}//}}}
bool TranscriptInfo::updateGeneNames(const vector<string> &geneList){//{{{
   if((long)geneList.size() != M){
      warning("TranscriptInfo: Number of items in gene list (%ld) does not match number of transcripts (%ld).",geneList.size(),M);
      return false;
   }
   // Copy gene names in the order they are.
   for(long i=0;i<M;i++){
      transcripts[i].g = geneList[i];
   }
   // Initialize gene info.
   setGeneInfo();
   return true;
}//}}}
bool TranscriptInfo::updateGeneNames(const map<string,string> &trGeneList){//{{{
   if((long)trGeneList.size() < M){
      warning("TranscriptInfo: Number of items in TR->GE map (%ld) is less than the number of transcripts (%ld).",trGeneList.size(),M);
      return false;
   }
   // Check all transcripts have associated gene name.
   for(long i=0;i<M;i++){
      if(!trGeneList.count(transcripts[i].t)){
         warning("TranscriptInfo: No gene name for transcript [%s].",transcripts[i].t.c_str());
         return false;
      }
   }
   // Set gene names.
   for(long i=0;i<M;i++){
      transcripts[i].g = trGeneList.find(transcripts[i].t)->second;
   }
   // Initialize gene info.
   setGeneInfo();
   return true;
}//}}}