File: SoloFeature_loadRawMatrix.cpp

package info (click to toggle)
rna-star 2.7.11b%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,492 kB
  • sloc: cpp: 21,951; awk: 827; ansic: 457; makefile: 192; sh: 31
file content (141 lines) | stat: -rw-r--r-- 5,983 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include "SoloFeature.h"
#include "streamFuns.h"
#include "ErrorWarning.h"
#include "SoloFeatureTypes.h"
#include "serviceFuns.cpp"

void SoloFeature::loadRawMatrix()
{    
    //make directories
    if (P.runModeIn.size()<3) {
        string errOut = "Exiting because of fatal PARAMETER error: --runMode soloCellFiltering should contain paths to count matrix input directorry and output prefix.";
        errOut       += "\nSOLUTION: re-run with --runMode soloCellFiltering </path/to/raw/count/dir/> </path/to/output/prefix>\n";
        exitWithError(errOut, std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
    };
    
    string inputPrefix= P.runModeIn[1] + '/';
    outputPrefix= P.runModeIn[2];
    outputPrefixFiltered= outputPrefix;

    /////////////////////////////////////////////////////////////
    //load counting matrix
    string matrixFileName=inputPrefix+pSolo.outFileNames[3];
    ifstream &matStream=ifstrOpen(matrixFileName, ERROR_OUT, "SOLUTION: check path and permission for the matrix file" + matrixFileName, P);

    //header
    while (matStream.peek() == '%') {
            string dummy;
            matStream.ignore(numeric_limits<streamsize>::max(), '\n');
    };
    
    uint32 nCB1; //number of features read from file
    uint64 nTot; //total number of entries
    matStream >> featuresNumber >> nCB1 >> nTot;
    
    if (nTot==0) {
        exitWithError("Exiting because of fatal INPUT FILE error: no counts detected in " + matrixFileName + \
                      "\nSOLUTION: check the formatting of the matrix file.\n", \
                       std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
    };
    
    /* do not need it - read the information from features file
    if (nfeat1 != featuresNumber) {
        ostringstream errOut;
        errOut <<"Exiting because of fatal INPUT FILE error: number of features in "<< matrixFileName <<": "<< nfeat1;
        errOut <<"\n   is not equal to the number of features from annotations from genome index: " << featuresNumber << "\n";
        errOut <<"SOLUTION: use the same genome index that was use in the STARsolo run that generated the matrix\n"; 
        exitWithError(errOut.str(), std::cerr, P.inOut->logMain, EXIT_CODE_PARAMETER, P);
    };
    */
    
    //struct {uint32 gene; uint32 cb; uint32 count} matEntry;
    /*vector<array<uint32,3>> matSparse(nTot);
    for (uint64 ii=0; ii<nTot; ii++) {
        for (uint32 jj=0; jj<3; jj++)
            matStream >> matSparse[ii][jj];
    };

    //sort by cell index
    std::sort(matSparse.begin(), matSparse.end(), [](const array<uint32,3> &m1, const array<uint32,3> &m2) {
                                                        return m1[1] < m2[1] ;
                                                    });
    */
    
    countMatStride=3; //gene, cell, count. Recording cell at shift=1 is temporary: later will replace cell with count
    countCellGeneUMI.resize(nTot*countMatStride,0);
    
    //countMatMult.s = 3;
    //countMatMult.m.resize(nTot*countMatStride,0.0);
    
    for (uint64 ii=0; ii<nTot; ii++) {
        matStream >> countCellGeneUMI[ii*countMatStride+0];//gene
        --countCellGeneUMI[ii*countMatStride+0]; //0-based gene
        
        matStream >> countCellGeneUMI[ii*countMatStride+1];//cell
        --countCellGeneUMI[ii*countMatStride+1]; //0-based cell
        
        double count1;
        matStream >> count1;
        countCellGeneUMI[ii*countMatStride+2] = std::round(count1);
        
        /* to keep fractional part
        matStream >> countMatMult.m[ii*countMatStride+2];//count: for now, only allow one value per cell/gene 
        countCellGeneUMI[ii*countMatStride+2] = std::round(countMatMult.m[ii*countMatStride+2]); //round to integer
        countMatMult.m[ii*countMatStride+2] -= countCellGeneUMI[ii*countMatStride+2]; //just the fractional part
        countMatMult.m[ii*countMatStride+0] = countCellGeneUMI[ii*countMatStride+0];
        countMatMult.m[ii*countMatStride+1] = countCellGeneUMI[ii*countMatStride+1];
        */
    };
    
    qsort((void*) countCellGeneUMI.data(), nTot, countMatStride*sizeof(countCellGeneUMI[0]), funCompareTypeSecondFirst<uint32>);
    
    
    //count number of detected cell
    nCB=0;
    uint32 ciprev=(uint32) -1;
    for (uint32 ii=0; ii<nTot; ii++) {
        uint32 ci1=countCellGeneUMI[ii*countMatStride+1];
        if (ci1!=ciprev) {//new cell
            ciprev=ci1;
            nCB++;
        };
    };    
    
    indCB.resize(nCB);
    countCellGeneUMIindex.resize(nCB);
    nUMIperCB.resize(nCB,0);
    nGenePerCB.resize(nCB,0);
    nReadPerCB.resize(nCB,0);
    
    nCB=(uint32) -1;
    ciprev=(uint32) -1;
    for (uint32 ii=0; ii<nTot; ii++) {
        uint32 ci1 = countCellGeneUMI[ii*countMatStride+1];
        if (ci1 != ciprev) {//new cell
            ciprev = ci1;
            nCB++;
            indCB[nCB] = ci1;
            countCellGeneUMIindex[nCB] = ii*countMatStride;
        };
        nGenePerCB[nCB]++;
        nUMIperCB[nCB] += countCellGeneUMI[ii*countMatStride+2];
        countCellGeneUMI[ii*countMatStride+1]=countCellGeneUMI[ii*countMatStride+2];//replace cell with count to keep standard convention about countCellGeneUMI
    };
    
    {//load barcodes
        ifstream &wlstream = ifstrOpen(inputPrefix+pSolo.outFileNames[2], ERROR_OUT, "SOLUTION: check the path and permissions of the barcodes file", P);
        pSolo.cbWLstr.resize(nCB1);
        for (auto &cb: pSolo.cbWLstr)
            std::getline(wlstream, cb);
    };
    
    {//copy features
        std::ifstream &infeat  = ifstrOpen(inputPrefix + pSolo.outFileNames[1], ERROR_OUT, "SOLUTION: check the path and permissions of the features file", P);
        createDirectory(outputPrefixFiltered, P.runDirPerm, "Solo output directory", P);
        std::ofstream &outfeat = ofstrOpen(outputPrefixFiltered + pSolo.outFileNames[1], ERROR_OUT, P);
        outfeat << infeat.rdbuf();
        outfeat.close();
    };
    
    return;
};