1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
#include "SoloFeature.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
#include "SequenceFuns.h"
#include "serviceFuns.cpp"
#include "soloInputFeatureUMI.h"
void SoloFeature::countSmartSeq()
{
time_t rawTime;
//nCB=pSolo.cbWLsize; //all cells are recorded
//nCB, indCB are recorded in sumThreads
for (int ii=0; ii<P.runThreadN; ii++) {
readFeatSum->addStats(*readFeatAll[ii]);
};
redistributeReadsByCB();
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished redistribution of reads from Solo read files"<<endl;
//////////////////////////////////////////////////////
nReadPerCB.resize(nCB);
cbFeatureUMImap.resize(nCB);
typedef struct {
uint32 feature;
uint32 count[2];//[0]=NoDedup, [1]=Exact
} typeFeatureCount;
vector<vector<typeFeatureCount>> vCellFeatureCount(nCB);
typedef struct {
uint32 feature;
uint64 umi;
} typeFeatureUMI;
vector<vector<typeFeatureUMI>::iterator> cbFeatUMI (nCB + 1);
#pragma omp parallel for num_threads(P.runThreadN)
for (uint32 ired=0; ired<redistrFilesCBfirst.size()-1; ired++) {//TODO: parallelize, each ired is independent here!
vector<typeFeatureUMI> vFeatureUMI (redistrFilesNreads[ired]);
uint32 iCB1=redistrFilesCBfirst[ired];
uint32 iCB2=redistrFilesCBfirst[ired+1];
//allocate arrays
cbFeatUMI[iCB1]=vFeatureUMI.begin();
for (uint32 icb=iCB1+1; icb<iCB2; icb++) {
cbFeatUMI[icb] = cbFeatUMI[icb-1] + readFeatSum->cbReadCount[indCB[icb-1]];
};
//input records
redistrFilesStreams[ired]->flush();
redistrFilesStreams[ired]->seekg(0,ios::beg);
uint32 feature;
uint64 umi, iread;
int32 cbmatch;
int64 cb;
vector<uint32> trIdDist; //not used
bool readInfoYes=false;
while (soloInputFeatureUMI(redistrFilesStreams[ired], featureType, readInfoYes, P.sjAll, iread, cbmatch, feature, umi, trIdDist)) {//cycle over file records
*redistrFilesStreams[ired] >> cb;
if (feature+1 == 0)
continue;
uint32 icb=indCBwl[cb];
*( cbFeatUMI[icb] + nReadPerCB[icb] )={feature,umi};
nReadPerCB[icb]++;
};
//collapse UMI, simple
for (uint32 icb=iCB1; icb<iCB2; icb++) {
if (nReadPerCB[icb]==0) {
continue;
};
sort(cbFeatUMI[icb], cbFeatUMI[icb]+nReadPerCB[icb],
[](const typeFeatureUMI &a, const typeFeatureUMI &b)
{return (a.feature < b.feature) || ( a.feature == b.feature && a.umi < b.umi); });
vCellFeatureCount[icb].reserve(8192);
vCellFeatureCount[icb].push_back({cbFeatUMI[icb]->feature, {1,1}});//first read
for (auto fu=cbFeatUMI[icb]+1; fu!=cbFeatUMI[icb]+nReadPerCB[icb]; fu++) {//cycle over all reads for this icb
if ( fu->feature != (fu-1)->feature ) {//compare to previous feature
vCellFeatureCount[icb].push_back({fu->feature, {1,1}});//create next feature entry
} else {//same feature
vCellFeatureCount[icb].back().count[0]++; //non-collapsed UMI count
if ( fu->umi != (fu-1)->umi ) {//same feature, new umi
vCellFeatureCount[icb].back().count[1]++;//collapsed UMI count
};
};
};
};
};
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished reading / collapsing" <<endl;
//convert to countCellGeneUMI. TODO - this is not necessary, recode output to use vCellFeatureCount
nUMIperCB.resize(nCB);
nGenePerCB.resize(nCB);
countMatStride = pSolo.umiDedup.yes.N + 1;
uint64 ccgN=0;//total number of entries in the countCellGeneUMI
for (auto &cbf : vCellFeatureCount) {
ccgN+=cbf.size();
};
countCellGeneUMI.resize(ccgN*countMatStride);
countCellGeneUMIindex.resize(nCB+1);
countCellGeneUMIindex[0]=0;
for (uint32 icb=0; icb<nCB; icb++) {//copy vCellFeatureCount
nGenePerCB[icb]=vCellFeatureCount[icb].size();
countCellGeneUMIindex[icb+1] = countCellGeneUMIindex[icb] + nGenePerCB[icb]*countMatStride;
for (uint64 ic=countCellGeneUMIindex[icb], ig=0; ic<countCellGeneUMIindex[icb+1]; ic+=countMatStride, ++ig) {//loop over genes
countCellGeneUMI[ic + 0] = vCellFeatureCount[icb][ig].feature;
if ( pSolo.umiDedup.yes.NoDedup )
countCellGeneUMI[ic + pSolo.umiDedup.countInd.NoDedup] = vCellFeatureCount[icb][ig].count[0];
if ( pSolo.umiDedup.yes.Exact )
countCellGeneUMI[ic + pSolo.umiDedup.countInd.Exact] = vCellFeatureCount[icb][ig].count[1];//collapsed UMI count
nUMIperCB[icb] += countCellGeneUMI[ic + pSolo.umiDedup.countInd.main];
};
};
// sum stats
uint32 nReadPerCBmax=0;
for (uint32 icb=0; icb<nCB; icb++) {
nReadPerCBmax=max(nReadPerCBmax,nReadPerCB[icb]);
readFeatSum->stats.V[readFeatSum->stats.nMatch] += nReadPerCB[icb];
readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[icb];
if (nGenePerCB[icb]>0)
++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];
};
readFeatSum->stats.V[readFeatSum->stats.nExactMatch]=readFeatSum->stats.V[readFeatSum->stats.nMatch];
time(&rawTime);
P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished SmartSeq counting" <<endl;
};
|