File: SoloFeature_quantTranscript.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (327 lines) | stat: -rwxr-xr-x 14,784 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327

#include "SoloFeature.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
#include "SequenceFuns.h"
#include "SoloCommon.h"
#include <unordered_map>
#include <bitset>
#include "serviceFuns.cpp"
#include <math.h>       /* log */

void SoloFeature::quantTranscript()
{//velocyto counting gets info from Gene counting
    time_t rawTime;

    if (pSolo.clusterCBfile=="-")
        return;//no transcript quantification w/o cluster file
    
    std::unordered_map<uint32,uint32> clusterCBind; //for each CB - cluster index of detected CB
    std::set<uint32> clusterInd; //cluster index for each cluster - the integer from clusterCBfile 
    {//load cluster information
        ifstream &clusterStream = ifstrOpen(pSolo.clusterCBfile, ERROR_OUT, "SOLUTION: check the path and permissions of the cluster CB file: " + pSolo.clusterCBfile, P);
        string seq1;
        while (clusterStream >> seq1) {
            uint32 icl1;
            clusterStream >> icl1;
            uint64 cb1;
            if (convertNuclStrToInt64(seq1,cb1)) {//convert to 2-bit format
                auto cb1it=std::equal_range(pSolo.cbWL.begin(), pSolo.cbWL.end(), cb1); //find iterator in WL matching cb1
                uint32 cb1ind=(uint32) (cb1it.first-pSolo.cbWL.begin()); //substract WL.begin iterator to find index in WL
                if (cb1ind < pSolo.cbWL.size()) {//otherwise cb1 is not in WL
                    clusterCBind[cb1ind]=icl1; //map: key=CB, value=cluster index
                    clusterInd.emplace(icl1);  //ordered set of cluster indexes
                } else {
                    P.inOut->logMain << "WARNING: cluster CB sequence not present in whitelist and is ignored: " << seq1 <<endl;
                };
            } else {
                P.inOut->logMain << "WARNING: cluster CB sequence contains non-ACGT base and is ignored: " << seq1 <<endl;
            };        
        };
    };
    
    auto &trDistCount=readFeatSum->transcriptDistCount; //transcriptDistCount is accumulated while mapping, from reads that map uniquely
    vector<double> trDistFun(trDistCount.size(),0.0);
    vector<double> trDistFunTrFactor(Trans.nTr,0.0);
    
    {//process distance distribution function
        //running average
        int32 runAverN=50, runAverStart=0;
        //for (uint32 ii=0; ii<runAverN; ii++)
        //    trDistFun[0] += trDistCount[ii];
        for (int32 ii=0; ii<runAverStart; ii++)
            trDistFun[ii] = (double) trDistCount[ii];
        for (int32 ii=runAverStart; ii<(int32)trDistCount.size()-runAverN-1; ii++) {
            trDistFun[ii] = (double) std::accumulate(trDistCount.begin()+max(runAverStart,ii-runAverN), trDistCount.begin()+ii+runAverN+1, 0) / min(2*runAverN+1, ii-runAverStart+runAverN);
        };
        
        //cut when becomes non-monotonic
        uint32 imax=1000;
        while (trDistFun[imax+1]>trDistFun[imax])
            imax++; //find maximum going forward from imax=1000
        P.inOut->logMain << "SoloQuant: distance distribution past maximum = " << imax <<endl;
        
        while (trDistFun[imax+1]<trDistFun[imax])
            imax++; //find first minimum after the maximum found above
        P.inOut->logMain << "SoloQuant: distance distribution cutoff = " << imax <<endl;
        
        trDistFun.resize(imax);
        
        //normalize
        double norm1 = std::accumulate(trDistFun.begin(), trDistFun.end(), 0.0);
        
        ofstream *streamTrDistFun = &ofstrOpen(outputPrefix+"transcriptEndDistanceDistribution.txt",ERROR_OUT, P);
        for (auto & ff : trDistFun) {
            ff = ff/norm1;
            *streamTrDistFun << ff <<'\n';
        };
        streamTrDistFun->close();
        
        //normalization factors for all transcripts
        vector<double> trDistFunCum(trDistFun);        
        std::partial_sum(trDistFun.begin(), trDistFun.end(), trDistFunCum.begin());
        for (uint32 ii=0; ii<Trans.nTr; ii++)
            if (Trans.trLen[ii]<trDistFunCum.size())
                trDistFunTrFactor[ii]=-std::log(trDistFunCum[Trans.trLen[ii]-1]);
        
        for (auto & ff : trDistFun)
            ff = std::log(ff); //now trDistFun is log of dist function
        
    };
    
    typedef struct {
        uint32 tr;
        double d;
    } transcriptDistProbStruct;
    //key=cluster index, key=CB/umi, value=vector of <trID, sum{trDistFun[distance3p]}> for each read with the same umi
    map<uint32, unordered_map<uint64,vector<transcriptDistProbStruct>>> mapTrDist;
    
    //////////// input records
    for (int iThread=0; iThread<P.runThreadN; iThread++) {//TODO: this can be parallelized
        fstream *streamReads = readFeatAll[iThread]->streamReads;
        streamReads->flush();
        streamReads->seekg(0,ios::beg);    
        
        string line1;
        while (std::getline(*streamReads, line1)) {//until the end of file
            stringstream lineStream(line1);
            uint32 cb, cbCl, nTr;
            uint64 umi;
            lineStream >> cb >> umi >> nTr;

            if (clusterCBind.count(cb)==0) //this cb is not in the clusters
                continue;
            
            umi += (((uint64)cb)<<32); //to make UMI from different CB distinguishable
            
            cbCl=clusterCBind[cb];
                   
            vector<transcriptDistProbStruct> tD;
            tD.reserve(nTr);
            for (uint32 ii=0; ii<nTr; ii++) {
                uint32 tr1,d1;
                lineStream >> tr1 >> d1;
                if (d1>=trDistFun.size())
                    continue; //do not record such outlier
                
                tD.push_back( {tr1, trDistFun[d1] + trDistFunTrFactor[tr1]} );
            }; 
            
            if (tD.size()==0)
                continue;
                        
            std::sort(tD.begin(), tD.end(), [](const transcriptDistProbStruct &t1, const transcriptDistProbStruct &t2) {
                                                    return (t1.tr<t2.tr);
                                              }            
                     );
            
            if (mapTrDist[cbCl].count(umi)==0) {//1st entry for this umi
                mapTrDist[cbCl][umi]=tD;
                continue;
            };

            uint32 inew=0;
            vector<transcriptDistProbStruct> tD1;
            tD1.reserve(mapTrDist[cbCl][umi].size());

            for (uint32 iold=0; iold<mapTrDist[cbCl][umi].size(); iold++) {//intersection of old with new
                while (inew < tD.size() && mapTrDist[cbCl][umi][iold].tr>tD[inew].tr) //move through the sorted lists
                    ++inew; //inew advances if old_trID>new_trID

                if (inew == tD.size() ) //end of tD reached
                    break;

                if (mapTrDist[cbCl][umi][iold].tr == tD[inew].tr) {//found match old_trID==new_trID 
                    tD1.push_back({  tD[inew].tr, mapTrDist[cbCl][umi][iold].d + tD[inew].d  });//add new log probability to the existing one
                };
            };
            mapTrDist[cbCl][umi]=tD1;//replace with intersection
        };        
    };

    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Transcript3p counting: finished input" <<endl;            
 
    map<uint32, vector<double>> clusterExpression; //per cluster, relative abundance
    //TODO parallelize this loop
    for (auto & mapTrDist1 : mapTrDist) {//loop over cell clusters
        auto &clTrDist=mapTrDist1.second;
        
        vector<double> trUnique(Trans.nTr,0), trInitial(Trans.nTr,0);//counts of unique read for each transcripts (i.e. reads that map uniquely only to this transcript)
        uint64 nUMItot=0, nUMI0=0, nUMI1=0;
        
        {//pre-process mapTrDist: 
            auto clTrDist1=clTrDist.begin();
            while ( clTrDist1!=clTrDist.end() ) {//loop over UMIs
                auto &trDist=clTrDist1->second;
                
                if (trDist.size()==0) {
                    clTrDist1=clTrDist.erase(clTrDist1);
                    nUMI0++;//these are cases where the intersection of transcript from different reads with the same UMI is *empty*
                    continue;
                } else if (trDist.size()==1) {
                    trUnique[trDist[0].tr]++;
                    trInitial[trDist[0].tr] += 1.0;
                    clTrDist1=clTrDist.erase(clTrDist1);
                    nUMI1++;//only one transcript in the intersection, i.e. unique mappers
                    nUMItot++;
                    continue;
                };
                
                double max1= std::max_element(trDist.begin(), trDist.end(), [](const transcriptDistProbStruct &t1, const transcriptDistProbStruct &t2) {
                                                                                   return (t1.d<t2.d);
                                                                                 }) -> d; //finds maximum d
                for (auto & tt : trDist) {//loop over transcripts in the UMI
                    trInitial[tt.tr] += 1.0/trDist.size();//for initialization, split each count between multimappers evenly and add it to unique mappers
                    tt.d =std::exp(tt.d-max1);//from sum(log(prob)) to product(prob), dividing by the max element to avoid underflow
                };                            //this constant factor does not matter, since only the ratios of tt.d matter
                
                nUMItot++;
                clTrDist1++;
            };
            P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Transcript3p counting: cluster " << mapTrDist1.first <<" nUMItot="<<nUMItot<<" nUMI0="<<nUMI0<<" nUMI1="<<nUMI1 <<endl; 
        };
        
//         double trExpressed=0;
//         for (auto & tt : trInitial)
//             trExpressed += tt;
//         
//         for (auto & tt : trInitial)
//             tt *= nUMItot/trExpressed;
        
        vector<double> thOldNew[2];
        thOldNew[0]=trInitial;
        thOldNew[1].resize(Trans.nTr,0);
        
        auto *thOldP = &thOldNew[0];
        auto *thNewP = &thOldNew[1];
        
        vector<bool> trConverged(trInitial.size(), false); //if true, this transcript is converged - then it's not updated in EM
        
        for (uint32 iteration=0; iteration<10000; iteration++) {//main EM loop
            auto &thNew = *thNewP;
            auto &thOld = *thOldP;

            //always start with trUnique counts
            std::copy(trUnique.begin(), trUnique.end(), thNew.begin()); 
            
            //calculate thNew
            for (auto & clTrDist1: clTrDist) {//loop over all UMIs
                auto &trDist=clTrDist1.second;
                
                double denom1=0.0;
                for  (auto & td : trDist) {//loop over transcripts in one UMI: caclulate denomiator
                    denom1 += td.d * thOld[td.tr];
                };
                
                for  (auto & td : trDist) {//loop over transcripts in one UMI: update thNew
                    if (!trConverged[td.tr])
                        thNew[td.tr] += td.d * thOld[td.tr] / denom1; //adding to unique counts
                };
            };
            
            //convergence check
            double diffThresholdMax=1e-5;
            double diffThresholdOne=diffThresholdMax*0.1;
            double exprThreshold=1e-8*nUMItot;
            double diffMax=0, diffSum=0, aboveThrN=0, aboveThrExprSum=0, aboveThrOneN=0 ;
            for (uint32 itr=0; itr<thNew.size(); itr++) {
                if (trConverged[itr] || thOld[itr]==0)
                    continue;
                
                double diff1 = std::abs(thNew[itr]-thOld[itr])/thOld[itr];
                diffSum += diff1;
                diffMax = max(diffMax,diff1);
                if (diff1 > diffThresholdMax) {
                    aboveThrN++;
                    aboveThrExprSum += thNew[itr];
                };
                
                if (thNew[itr]<exprThreshold) {
                    trConverged[itr]=true;
                    //trUnique[itr]=thNew[itr];//0 here could make more sense, but convergence suffers
                    trUnique[itr]=0;
                };
                if (diff1<diffThresholdOne) {
                    trConverged[itr]=true;
                    trUnique[itr]=thNew[itr]; //this is the final value for this transcript
                } else {
                    aboveThrOneN++;
                };
            };
            
            cout <<iteration <<" "<<diffMax<<" "<<diffSum<<" "<<aboveThrN<<" "<<aboveThrExprSum<<" "<<aboveThrOneN<<endl;
            
            if (diffMax<diffThresholdMax)
                break;
            swap(thNewP,thOldP); //swap for the next iteration
        };
        
        auto &thOut=*thNewP;
        {//renormalize into TPMs
            double norm1=0;
            for (uint32 itr=0; itr<thOut.size(); itr++) {
                 thOut[itr] *= std::exp(trDistFunTrFactor[itr]);
                 norm1 += thOut[itr];
            };
            norm1=nUMItot/norm1;
            for (uint32 itr=0; itr<thOut.size(); itr++) {
                 thOut[itr] *= norm1;
            };
        };
        
        clusterExpression[mapTrDist1.first]=thOut;
        
        time(&rawTime);
        P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Transcript3p counting: finished cluster" << mapTrDist1.first <<endl;            
    };
        
    {//output counting matrix
        string matrixFileName=outputPrefix+pSolo.outFileNames[3];
        ofstream &countMatrixStream=ofstrOpen(matrixFileName,ERROR_OUT, P);
        countMatrixStream <<"%%MatrixMarket matrix coordinate real general\n%\n";
        
        uint32 nCellGeneEntries = 0;
        for (auto & ctpm : clusterExpression)
            for (auto & tt : ctpm.second)
                if (tt>0)
                    nCellGeneEntries++;
        
        countMatrixStream << Trans.nTr <<' '<< *std::max_element(clusterInd.begin(),clusterInd.end()) <<' '<< nCellGeneEntries << '\n';

        for (auto & ctpm : clusterExpression)
            for (auto itt=ctpm.second.begin(); itt!=ctpm.second.end(); itt++)
                if (*itt>0)
                    countMatrixStream << itt-ctpm.second.begin()+1  <<' '<<  ctpm.first  <<' '<< *itt << '\n';

        countMatrixStream.flush();
    };
    
    ofstream &outStr=ofstrOpen(outputPrefix+"/features.tsv",ERROR_OUT, P);        
    for (uint32 ii=0; ii<Trans.nTr; ii++)
        outStr << Trans.trID[ii] <<"\t"<< Trans.trLen[ii] <<"\t"<< Trans.geName[Trans.trGene[ii]] << '\n';
    outStr.close();
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Transcript3p counting: finished transcript quantification" <<endl;
};