File: soloBasicCellFilter.awk

package info (click to toggle)
rna-star 2.7.8a%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,076 kB
  • sloc: cpp: 20,429; awk: 483; ansic: 470; makefile: 181; sh: 31
file content (78 lines) | stat: -rw-r--r-- 2,160 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# usage: awk -v exactCells=... -v maxCells=... -v maxPercentile=... -v maxMinRatio=... -f soloBasicCellFilter.awk <barcodes file> <matrix file>
#
# Requires gawk: the script relies on the gawk extensions ARGIND and asort().
# The first file operand is read as the barcode list, the second as the matrix.
# Results are written next to the inputs as "<barcodes file>.filtered" and
# "<matrix file>.filtered" (the END rule also writes "<barcodes file>.filtered.counts").
BEGIN {
    # default values - if variables were not defined by the user
    # (an unset -v variable compares equal to 0, so 0 means "not given")

    # exactCells: assigning 0 turns an unset variable into a real 0 so that it
    # prints as "0" below; 0 selects the maxPercentile/maxMinRatio filter in END
    if (exactCells==0)
        exactCells=0;
    if (maxCells==0)
        maxCells=3000;
    if (maxPercentile==0)
        maxPercentile=0.99;
    if (maxMinRatio==0)
        maxMinRatio=10;

    print "Parameters: " "exactCells=" exactCells, "maxCells=" maxCells, "maxPercentile=", maxPercentile, "maxMinRatio=", maxMinRatio;

    # number of leading lines of the matrix file that are kept aside as header;
    # the last of them is rewritten by the END rule
    nHeaderLines=3;

    # output names are derived from the two input file operands
    fOutCB=ARGV[1] ".filtered";
    fOutMat=ARGV[2] ".filtered";
}

# Per-record rule: runs for every line of both input files.
#   file 1 (ARGIND==1): barcode list, one barcode per line -> CB[line number]
#   later files       : matrix file; the first nHeaderLines lines are stored
#                       verbatim in A[], every following line is one entry
{
    if (ARGIND!=1) {
        if (FNR>nHeaderLines) {
            # matrix entry: remember the line and add its value to the
            # running total of the cell it belongs to
            nLines=FNR;            # last entry line seen so far
            cellG[nLines]=$1;      # column 1 (presumably the gene index)
            cellI[nLines]=$2;      # column 2: cell index, i.e. line number in the barcode file
            cellN[nLines]=$3;      # column 3: value summed per cell
            cellTot[$2]+=$3;
        } else {
            # header line: keep it so END can copy it to the output
            A[FNR]=$0;
            if (FNR==nHeaderLines) {
                # first field of the last header line is re-emitted by END
                # as the first dimension of the filtered matrix
                nGenes=$1;
            };
        };
    } else {
        # barcode file: NR equals the line number here because it is the first file
        CB[NR]=$1;
    };
}
# END rule: derive the minimum per-cell total (nMin), then write
#   fOutCB            - barcodes of the cells with total >= nMin
#   fOutCB ".counts"  - new cell index and total for each retained cell
#   fOutMat           - header + matrix entries of retained cells, with the
#                       cell column renumbered to the new indices
# Threshold selection:
#   exactCells==0 : nMax is the total ranked int((1-maxPercentile)*maxCells)
#                   places below the largest one; nMin = nMax/maxMinRatio
#   exactCells>0  : nMin is the total ranked exactCells places below the
#                   largest one (the smallest total if there are not enough cells)
END {
    # asort() returns the element count; keep it, because length() of these
    # arrays must not be relied on after elements have been referenced
    nTot=asort(cellTot,cellTotSort);   # ascending per-cell totals in cellTotSort[1..nTot]
    nCB=length(CB);

    nMax=0;
    nMin=0;
    if (nTot>0) {
        # clamp to the smallest total when there are fewer cells than the
        # requested rank; an index <=0 would yield an empty nMax and nMin=0
        iMax=nTot-int((1-maxPercentile)*maxCells);
        if (iMax<1)
            iMax=1;
        nMax=cellTotSort[iMax];
        nMin=nMax/maxMinRatio;

        # "<=" (not "<") so that nTot==exactCells does not read cellTotSort[0]
        if (exactCells>0)
            nMin=(nTot<=exactCells) ? cellTotSort[1] : cellTotSort[nTot-exactCells];
    };

    nCell=0;
    for (ii=1; ii<=nCB; ii++) {
        # "in" test: barcodes without any matrix entry are skipped and no
        # empty cellTot[ii] element is created as a side effect
        if ((ii in cellTot) && cellTot[ii]>=nMin) {
            print CB[ii] > fOutCB;
            nCell++;
            cellInew[ii]=nCell;   # old cell index -> index in the filtered output
            # parenthesized: an unparenthesized concatenation after ">" is ambiguous
            print nCell,cellTot[ii] > (fOutCB ".counts");
        };
    };

    if (exactCells==0) {
        maxTot=(nTot>0) ? cellTotSort[nTot] : 0;
        print "maxUMIperCell=" maxTot+0,"Robust maxUMIperCel=" nMax,"minUMIperCell=" nMin, "Filtered N cells=", nCell;
    } else {
        print "total N cells=" nTot, "exactCells=" exactCells, "minUMIperCell=" nMin;
    };

    # first pass over the stored entries: count those that survive, since the
    # count has to be written in the header before the entries themselves
    nMat=0;
    for (ii=nHeaderLines+1; ii<=nLines; ii++) {
        if (cellTot[cellI[ii]]>=nMin) {
            nMat++;
        };
    };

    # copy all header lines except the last one, which is replaced below
    for (ii=1;ii<nHeaderLines;ii++) {
        print A[ii] > fOutMat;
    };

    # replacement for the last header line, with the filtered cell and entry counts
    print nGenes,nCell+0,nMat+0 > fOutMat;

    # second pass: emit surviving entries with renumbered cell indices
    for (ii=nHeaderLines+1; ii<=nLines; ii++) {
        if (cellTot[cellI[ii]]>=nMin) {
            print cellG[ii],cellInew[cellI[ii]],cellN[ii] > fOutMat;
        };
    };

};