File: SoloFeature_countSmartSeq.cpp

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (155 lines) | stat: -rw-r--r-- 5,971 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include "SoloFeature.h"
#include "streamFuns.h"
#include "TimeFunctions.h"
#include "SequenceFuns.h"
#include "serviceFuns.cpp"
#include "soloInputFeatureUMI.h"


void SoloFeature::countSmartSeq()
{    
    time_t rawTime;
    
    //nCB=pSolo.cbWLsize; //all cells are recorded
    //nCB, indCB are recorded in sumThreads

    for (int ii=0; ii<P.runThreadN; ii++) {
        readFeatSum->addStats(*readFeatAll[ii]);
    };
    
    redistributeReadsByCB();
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) <<" ... Finished redistribution of reads from Solo read files"<<endl;   
    
    //////////////////////////////////////////////////////
    nReadPerCB.resize(nCB);
    cbFeatureUMImap.resize(nCB);
       
    typedef struct {
        uint32 feature;
        uint32 count[2];//[0]=NoDedup, [1]=Exact
    } typeFeatureCount;   
    
    vector<vector<typeFeatureCount>> vCellFeatureCount(nCB);
    
    typedef struct {
        uint32 feature;
        uint64 umi;
    } typeFeatureUMI;    
    vector<vector<typeFeatureUMI>::iterator> cbFeatUMI (nCB + 1);    
    
    #pragma omp parallel for num_threads(P.runThreadN)
    for (uint32 ired=0; ired<redistrFilesCBfirst.size()-1; ired++) {//TODO: parallelize, each ired is independent here!
        vector<typeFeatureUMI> vFeatureUMI (redistrFilesNreads[ired]);
        
        uint32 iCB1=redistrFilesCBfirst[ired];
        uint32 iCB2=redistrFilesCBfirst[ired+1];
        
        //allocate arrays    
        cbFeatUMI[iCB1]=vFeatureUMI.begin();
        for (uint32 icb=iCB1+1; icb<iCB2; icb++) {
            cbFeatUMI[icb] = cbFeatUMI[icb-1] + readFeatSum->cbReadCount[indCB[icb-1]];
        };        
        
        //input records
        redistrFilesStreams[ired]->flush();
        redistrFilesStreams[ired]->seekg(0,ios::beg);
        
        uint32 feature;
        uint64 umi, iread;
        int32 cbmatch;
        int64 cb;
        vector<uint32> trIdDist; //not used
        bool readInfoYes=false;
        while (soloInputFeatureUMI(redistrFilesStreams[ired], featureType, readInfoYes, P.sjAll, iread, cbmatch, feature, umi, trIdDist)) {//cycle over file records
            
            *redistrFilesStreams[ired] >> cb;
            
            if (feature+1 == 0) 
                continue;

            uint32 icb=indCBwl[cb];
            *( cbFeatUMI[icb] + nReadPerCB[icb] )={feature,umi};
            nReadPerCB[icb]++;
        };
        
        //collapse UMI, simple
        for (uint32 icb=iCB1; icb<iCB2; icb++) {
            if (nReadPerCB[icb]==0) {
                continue;
            };
            
            sort(cbFeatUMI[icb], cbFeatUMI[icb]+nReadPerCB[icb], 
                    [](const typeFeatureUMI &a, const typeFeatureUMI &b) 
                      {return (a.feature < b.feature) || ( a.feature == b.feature && a.umi < b.umi); });
                  
            vCellFeatureCount[icb].reserve(8192);
            vCellFeatureCount[icb].push_back({cbFeatUMI[icb]->feature, {1,1}});//first read
            for (auto fu=cbFeatUMI[icb]+1; fu!=cbFeatUMI[icb]+nReadPerCB[icb]; fu++) {//cycle over all reads for this icb
                if ( fu->feature != (fu-1)->feature ) {//compare to previous feature
                    vCellFeatureCount[icb].push_back({fu->feature, {1,1}});//create next feature entry
                } else {//same feature
                    vCellFeatureCount[icb].back().count[0]++; //non-collapsed UMI count   
                    
                    if ( fu->umi != (fu-1)->umi ) {//same feature, new umi
                        vCellFeatureCount[icb].back().count[1]++;//collapsed UMI count
                    };
                };
            };
        };
    };
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished reading / collapsing" <<endl;    
    
    //convert to countCellGeneUMI. TODO - this is not necessary, recode output to use vCellFeatureCount
    nUMIperCB.resize(nCB);
    nGenePerCB.resize(nCB);
      
    countMatStride = pSolo.umiDedup.yes.N + 1;
    
    uint64 ccgN=0;//total number of entries in the countCellGeneUMI
    for (auto &cbf :  vCellFeatureCount) {
        ccgN+=cbf.size();
    };
    countCellGeneUMI.resize(ccgN*countMatStride);
    countCellGeneUMIindex.resize(nCB+1);
    countCellGeneUMIindex[0]=0;
    
    for (uint32 icb=0; icb<nCB; icb++) {//copy vCellFeatureCount
                
        nGenePerCB[icb]=vCellFeatureCount[icb].size();
        countCellGeneUMIindex[icb+1] = countCellGeneUMIindex[icb] + nGenePerCB[icb]*countMatStride;
        
        for (uint64 ic=countCellGeneUMIindex[icb], ig=0; ic<countCellGeneUMIindex[icb+1]; ic+=countMatStride, ++ig) {//loop over genes
            
            countCellGeneUMI[ic + 0] = vCellFeatureCount[icb][ig].feature;
            
            if ( pSolo.umiDedup.yes.NoDedup )
                countCellGeneUMI[ic + pSolo.umiDedup.countInd.NoDedup] = vCellFeatureCount[icb][ig].count[0];
            if ( pSolo.umiDedup.yes.Exact )
                countCellGeneUMI[ic + pSolo.umiDedup.countInd.Exact] = vCellFeatureCount[icb][ig].count[1];//collapsed UMI count
                
            nUMIperCB[icb] += countCellGeneUMI[ic + pSolo.umiDedup.countInd.main];
        };
            
    };

    // sum stats
    uint32 nReadPerCBmax=0;
    for (uint32 icb=0; icb<nCB; icb++) {
        
        nReadPerCBmax=max(nReadPerCBmax,nReadPerCB[icb]);
        readFeatSum->stats.V[readFeatSum->stats.nMatch] += nReadPerCB[icb];
                
        readFeatSum->stats.V[readFeatSum->stats.nUMIs] += nUMIperCB[icb];
        if (nGenePerCB[icb]>0)
            ++readFeatSum->stats.V[readFeatSum->stats.nCellBarcodes];        
    };
    
    readFeatSum->stats.V[readFeatSum->stats.nExactMatch]=readFeatSum->stats.V[readFeatSum->stats.nMatch];            
    
    time(&rawTime);
    P.inOut->logMain << timeMonthDayTime(rawTime) << " ... Finished SmartSeq counting" <<endl;
};