1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
|
#include "SoloFeature.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
#include "SequenceFuns.h"
#include "SoloCommon.h"
#include <unordered_map>
#include <bitset>
void SoloFeature::countVelocyto()
{//velocyto counting gets info from Gene counting
time_t rawTime;
nReadPerCB.resize(nCB);
vector<unordered_map<uintUMI,vector<trTypeStruct>>> cuTrTypes (nCB);
for (uint32 ii=0; ii<nCB; ii++)
cuTrTypes[ii].reserve(readFeatSum->cbReadCount[ii] > 100 ? readFeatSum->cbReadCount[ii] : readFeatSum->cbReadCount[ii]/5); //heuristics...
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: allocated arrays" <<endl;
//////////// input records
for (int iThread=0; iThread<P.runThreadN; iThread++) {//TODO: this can be parallelized
fstream *streamReads = readFeatAll[iThread]->streamReads;
streamReads->flush();
streamReads->seekg(0,ios::beg);
uint64 iread;
while (*streamReads >> iread) {//until the end of file
uintCB cb=soloFeatAll[pSolo.featureInd[SoloFeatureTypes::Gene]]->readInfo[iread].cb;
uintUMI umi=soloFeatAll[pSolo.featureInd[SoloFeatureTypes::Gene]]->readInfo[iread].umi;
if (cb == (uintCB)-1 || umi == (uintUMI)-1 ) {//CB and/or UMI undefined. TODO: put a filter on CBs here, e.g. UMI threshold
streamReads->ignore((uint32)-1, '\n');
continue;
};
uint32 iCB=indCBwl[cb];
nReadPerCB[iCB]++;//simple estimate
if (cuTrTypes[iCB].count(umi)>0 && cuTrTypes[iCB][umi].empty()) {//intersection is empty, no need to load this transcript
streamReads->ignore((uint32)-1, '\n');
continue;
};
uint32 nTr;
*streamReads >> nTr;
vector<trTypeStruct> trT(nTr);
for (auto & tt: trT) {
uint32 ty;
*streamReads >> tt.tr >> ty;
tt.type=(uint8) ty;
};
if (cuTrTypes[iCB].count(umi)==0) {//1st entry for this umi
cuTrTypes[iCB][umi]=trT;
continue;
};
uint32 inew=0;
vector<trTypeStruct> trT1;
trT1.reserve(cuTrTypes[iCB][umi].size());
for (uint32 iold=0; iold<cuTrTypes[iCB][umi].size(); iold++) {//intersection of old with new
while (inew < trT.size() && cuTrTypes[iCB][umi][iold].tr>trT[inew].tr) //move through the sorted lists
++inew;
if (inew == trT.size() ) //end of trT reached
break;
if (cuTrTypes[iCB][umi][iold].tr == trT[inew].tr) {//
trT1.push_back({trT[inew].tr, (uint8)(cuTrTypes[iCB][umi][iold].type | trT[inew].type)});
};
};
cuTrTypes[iCB][umi]=trT1;//replace with intersection
};
};
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: finished input" <<endl;
//////////////////////////////////////////////////////////////////////////////
/////////////////////////// counts for each CB
nUMIperCB.resize(nCB,0);
nGenePerCB.resize(nCB,0);
countMatStride=4; //spliced/unspliced/ambiguous
countCellGeneUMI.resize(nReadsMapped*countMatStride/5+16); //5 is heuristic, will be resized if needed
countCellGeneUMIindex.resize(nCB+1);
countCellGeneUMIindex[0]=0;
for (uint32 iCB=0; iCB<nCB; iCB++) {//main collapse cycle
map<uint32,array<uint32,3>> geneC;
for (auto &umi : cuTrTypes[iCB]) {//cycle over UMIs
if (umi.second.empty()) //no transcripts in the intesect
continue;
uint32 geneI=Trans.trGene[umi.second[0].tr];
//for each transcript, are all UMIs:
bool exonModel=false; //purely exonic
bool intronModel=false; //purely intronic
bool spanModel=true; //spanning model for all UMIs
bool mixedModel=false; //both intronic and exonic, but not spanning
for (auto &tt : umi.second) {//cycle over transcripts in this UMI
if (Trans.trGene[tt.tr] != geneI) {//multigene
geneI=(uint32)-1;
break;
};
bitset<velocytoTypeGeneBits> gV (tt.type);
mixedModel |= ((gV.test(AlignVsTranscript::Intron) && gV.test(AlignVsTranscript::Concordant)) || gV.test(AlignVsTranscript::ExonIntron)) && !gV.test(AlignVsTranscript::ExonIntronSpan);//has exon and intron, but no span
spanModel &= gV.test(AlignVsTranscript::ExonIntronSpan);
exonModel |= gV.test(AlignVsTranscript::Concordant) && !gV.test(AlignVsTranscript::Intron) && !gV.test(AlignVsTranscript::ExonIntron);//has only-exons model
intronModel |= gV.test(AlignVsTranscript::Intron) && !gV.test(AlignVsTranscript::ExonIntron) && !gV.test(AlignVsTranscript::Concordant);//has only-introns model, this includes span
};
if (geneI+1==0) //multigene
continue;
if (exonModel && !intronModel && !mixedModel) {// all models are only-exons
geneC[geneI][0]++; //spliced
} else if ( spanModel ||
( (intronModel || mixedModel) && !exonModel)
) {//all models are only-spans. Not sure if this ever happens, should be simplified. Intron or mixed, but not exon
geneC[geneI][1]++;//unspliced
} else {//all other combinations are mixed models, i.e. both only-exons and only-introns, only-exons and mixed
geneC[geneI][2]++;//ambiguous
};
nUMIperCB[iCB]++;
};
countCellGeneUMIindex[iCB+1] = countCellGeneUMIindex[iCB];
if (nUMIperCB[iCB]==0) //no UMIs counted for this CB
continue;
nGenePerCB[iCB]+=geneC.size();
readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[iCB];
++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];
if (countCellGeneUMI.size() < countCellGeneUMIindex[iCB+1] + geneC.size()*countMatStride) //allocated vector too small
countCellGeneUMI.resize(countCellGeneUMI.size()*2);
for (auto & gg : geneC) {
countCellGeneUMI[countCellGeneUMIindex[iCB+1]+0]=gg.first;
for (uint32 ii=0;ii<3;ii++)
countCellGeneUMI[countCellGeneUMIindex[iCB+1]+1+ii]=gg.second[ii];
countCellGeneUMIindex[iCB+1] += countMatStride;
};
};
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Velocyto counting: finished collapsing UMIs" <<endl;
};
|