File: SoloFeature_countVelocyto.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (158 lines) | stat: -rw-r--r-- 7,075 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include "SoloFeature.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
#include "SequenceFuns.h"
#include "SoloCommon.h"
#include <unordered_map>
#include <bitset>

void SoloFeature::countVelocyto()
{//velocyto counting gets info from Gene counting
    time_t rawTime;

    nReadPerCB.resize(nCB);

    vector<unordered_map<uintUMI,vector<trTypeStruct>>> cuTrTypes (nCB);
    for (uint32 ii=0; ii<nCB; ii++)
        cuTrTypes[ii].reserve(readFeatSum->cbReadCount[ii] > 100 ? readFeatSum->cbReadCount[ii] : readFeatSum->cbReadCount[ii]/5); //heuristics...
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: allocated arrays" <<endl;
    
    //////////// input records
    for (int iThread=0; iThread<P.runThreadN; iThread++) {//TODO: this can be parallelized
        fstream *streamReads = readFeatAll[iThread]->streamReads;
        streamReads->flush();
        streamReads->seekg(0,ios::beg);
        
        uint64 iread;
        while (*streamReads >> iread) {//until the end of file
            uintCB cb=soloFeatAll[pSolo.featureInd[SoloFeatureTypes::Gene]]->readInfo[iread].cb;
            uintUMI umi=soloFeatAll[pSolo.featureInd[SoloFeatureTypes::Gene]]->readInfo[iread].umi;
            if (cb == (uintCB)-1 || umi == (uintUMI)-1 ) {//CB and/or UMI undefined.   TODO: put a filter on CBs here, e.g. UMI threshold
                streamReads->ignore((uint32)-1, '\n');
                continue;
            };

            uint32 iCB=indCBwl[cb];
            nReadPerCB[iCB]++;//simple estimate
            
            if (cuTrTypes[iCB].count(umi)>0 && cuTrTypes[iCB][umi].empty()) {//intersection is empty, no need to load this transcript
                streamReads->ignore((uint32)-1, '\n');
                continue;
            };

            uint32 nTr;
            *streamReads >> nTr;
            vector<trTypeStruct> trT(nTr);
            for (auto & tt: trT) {
                uint32 ty;
                *streamReads >> tt.tr >> ty;
                tt.type=(uint8) ty;
            };

            if (cuTrTypes[iCB].count(umi)==0) {//1st entry for this umi
                cuTrTypes[iCB][umi]=trT;
                continue;
            };
            
            uint32 inew=0;
            vector<trTypeStruct> trT1;
            trT1.reserve(cuTrTypes[iCB][umi].size());
            
            for (uint32 iold=0; iold<cuTrTypes[iCB][umi].size(); iold++) {//intersection of old with new
                while (inew < trT.size() && cuTrTypes[iCB][umi][iold].tr>trT[inew].tr) //move through the sorted lists
                    ++inew;
                
                if (inew == trT.size() ) //end of trT reached
                    break;
                
                if (cuTrTypes[iCB][umi][iold].tr == trT[inew].tr) {//
                    trT1.push_back({trT[inew].tr, (uint8)(cuTrTypes[iCB][umi][iold].type | trT[inew].type)});
                };
            };
            cuTrTypes[iCB][umi]=trT1;//replace with intersection
        };        
    };   
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: finished input" <<endl;

    
    //////////////////////////////////////////////////////////////////////////////
    /////////////////////////// counts for  each CB
    nUMIperCB.resize(nCB,0);
    nGenePerCB.resize(nCB,0);
       
    countMatStride=4; //spliced/unspliced/ambiguous
    countCellGeneUMI.resize(nReadsMapped*countMatStride/5+16); //5 is heuristic, will be resized if needed
    countCellGeneUMIindex.resize(nCB+1);
    countCellGeneUMIindex[0]=0;
    
    for (uint32 iCB=0; iCB<nCB; iCB++) {//main collapse cycle
        map<uint32,array<uint32,3>> geneC;
        for (auto &umi : cuTrTypes[iCB]) {//cycle over UMIs
            if (umi.second.empty()) //no transcripts in the intesect
                continue;
            uint32 geneI=Trans.trGene[umi.second[0].tr];
            
            //for each transcript, are all UMIs:
            bool exonModel=false;   //purely exonic
            bool intronModel=false; //purely intronic
            bool spanModel=true; //spanning model for all UMIs
            bool mixedModel=false;  //both intronic and exonic, but not spanning
            
            for (auto &tt : umi.second) {//cycle over transcripts in this UMI
                if (Trans.trGene[tt.tr] != geneI) {//multigene
                    geneI=(uint32)-1;
                    break;
                };
                
                bitset<velocytoTypeGeneBits> gV (tt.type);
                
                mixedModel  |= ((gV.test(AlignVsTranscript::Intron) && gV.test(AlignVsTranscript::Concordant)) || gV.test(AlignVsTranscript::ExonIntron)) && !gV.test(AlignVsTranscript::ExonIntronSpan);//has exon and intron, but no span
                spanModel   &= gV.test(AlignVsTranscript::ExonIntronSpan);
                exonModel   |= gV.test(AlignVsTranscript::Concordant) && !gV.test(AlignVsTranscript::Intron) && !gV.test(AlignVsTranscript::ExonIntron);//has only-exons model                
                intronModel |= gV.test(AlignVsTranscript::Intron) && !gV.test(AlignVsTranscript::ExonIntron) && !gV.test(AlignVsTranscript::Concordant);//has only-introns model, this includes span
            };
            
            if (geneI+1==0) //multigene
                continue;
            
            if (exonModel && !intronModel && !mixedModel) {// all models are only-exons
                geneC[geneI][0]++; //spliced 
            } else if ( spanModel ||
                        ( (intronModel || mixedModel) && !exonModel)
                      ) {//all models are only-spans. Not sure if this ever happens, should be simplified. Intron or mixed, but not exon
                geneC[geneI][1]++;//unspliced
            } else {//all other combinations are mixed models, i.e. both only-exons and only-introns, only-exons and mixed
                geneC[geneI][2]++;//ambiguous
            };
            
            nUMIperCB[iCB]++;
        };
        
        countCellGeneUMIindex[iCB+1] = countCellGeneUMIindex[iCB];        
        
        if (nUMIperCB[iCB]==0) //no UMIs counted for this CB
            continue;
        
        nGenePerCB[iCB]+=geneC.size();
        readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[iCB];
        ++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];
        
        if (countCellGeneUMI.size() < countCellGeneUMIindex[iCB+1] + geneC.size()*countMatStride) //allocated vector too small
            countCellGeneUMI.resize(countCellGeneUMI.size()*2);        
        
        for (auto & gg : geneC) {
            countCellGeneUMI[countCellGeneUMIindex[iCB+1]+0]=gg.first;
            for (uint32 ii=0;ii<3;ii++)
                countCellGeneUMI[countCellGeneUMIindex[iCB+1]+1+ii]=gg.second[ii];
            countCellGeneUMIindex[iCB+1] += countMatStride;
        };
    };

    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: finished collapsing UMIs" <<endl;
};