# Basic cell filtering (gawk script): keeps the barcodes whose total count passes a
# minimum threshold and writes the filtered barcode list and the filtered matrix.
# usage awk -v exactCells=... -v maxCells=... -v maxPercentile=... -v maxMinRatio=... -f soloBasicCellFilter.awk
BEGIN {
    # Runs once before any input is read: fills in defaults for the -v options,
    # echoes the effective parameters, and derives the output file names.
    #
    # Options (passed as: awk -v name=value):
    #   exactCells    - when > 0, the END block derives the threshold from this
    #                   rank instead of from the three options below
    #   maxCells      - default 3000
    #   maxPercentile - default 0.99
    #   maxMinRatio   - default 10
    #
    # default values - if variables were not defined by the user
    # (an unset awk variable compares equal to 0, so an explicit 0 also selects the default)
    if (exactCells==0)
        exactCells=0;
    if (maxCells==0)
        maxCells=3000;
    if (maxPercentile==0)
        maxPercentile=0.99;
    if (maxMinRatio==0)
        maxMinRatio=10;

    # Parameter echo goes to standard output; the text is kept exactly as it was.
    print "Parameters: " "exactCells=" exactCells, "maxCells=" maxCells, "maxPercentile=", maxPercentile, "maxMinRatio=", maxMinRatio;

    # Number of leading lines of the second input file that are treated as header.
    nHeaderLines=3;

    # Output names are the two input file names with ".filtered" appended.
    fOutCB=ARGV[1] ".filtered";
    fOutMat=ARGV[2] ".filtered";
}
# First input file (ARGIND is a gawk built-in): read barcodes, one per line,
# indexed by line number.
ARGIND == 1 {
    CB[NR] = $1;
}

# Any later input file, header part: keep the first nHeaderLines lines verbatim.
# The first field of the last header line is remembered as nGenes.
ARGIND != 1 && FNR <= nHeaderLines {
    A[FNR] = $0;
    if (FNR == nHeaderLines)
        nGenes = $1;
}

# Any later input file, body part: store the three fields of each line by line
# number and accumulate the third field per value of the second field.
ARGIND != 1 && FNR > nHeaderLines {
    nLines = FNR;
    cellG[nLines] = $1;
    cellI[nLines] = $2;
    cellN[nLines] = $3;
    cellTot[$2] += $3;
}
END {
    # Chooses the minimum per-cell total (nMin), then writes:
    #   fOutCB            - barcodes of the cells that pass, in input order
    #   fOutCB ".counts"  - new cell index and per-cell total for each passing cell
    #   fOutMat           - header lines, a recomputed dimensions line, and the
    #                       matrix rows of passing cells with renumbered cell indices
    # A one-line summary is printed to standard output.
    # NOTE(review): the three-line header followed by "row col value" records looks
    # like MatrixMarket coordinate format - confirm against the producer.

    # Number of cells that have at least one matrix entry. Captured up front so
    # that nothing later in this block can change the reported value.
    nCellsTotal = length(cellTot);

    # cellTotSort[1..nCellsTotal] holds the per-cell totals in ascending order.
    asort(cellTot, cellTotSort);

    # Robust maximum: the total found int((1-maxPercentile)*maxCells) positions
    # below the top. The position is clamped to 1 so that inputs with fewer cells
    # than that offset use the smallest total instead of an out-of-range
    # (empty) element, which would make nMin 0 and let every barcode through.
    iMax = nCellsTotal - int((1-maxPercentile)*maxCells);
    if (iMax < 1)
        iMax = 1;
    nMax = (nCellsTotal > 0) ? cellTotSort[iMax] : 0;
    nMin = nMax/maxMinRatio;

    # exactCells overrides the ratio-based threshold with a rank-based one.
    # With more cells than exactCells, the threshold is the (exactCells+1)-th
    # largest total, and cells are kept with ">=" below (unchanged behaviour).
    # With exactCells or fewer cells, the smallest total is used so all of them pass.
    if (exactCells > 0) {
        if (nCellsTotal == 0)
            nMin = 0;
        else if (nCellsTotal <= exactCells)
            nMin = cellTotSort[1];
        else
            nMin = cellTotSort[nCellsTotal-exactCells];
    }

    # Parenthesised/precomputed: an unparenthesised concatenation after ">" is
    # ambiguous across awk implementations.
    fOutCounts = fOutCB ".counts";

    # Select passing barcodes and assign consecutive new indices.
    # "ii in cellTot" keeps barcodes without any counts out of the result and
    # avoids creating empty cellTot entries as a side effect of the lookup.
    nBarcodes = length(CB);
    nCell = 0;
    for (ii=1; ii<=nBarcodes; ii++) {
        if ((ii in cellTot) && cellTot[ii]>=nMin) {
            nCell++;
            cellInew[ii] = nCell;
            print CB[ii] > fOutCB;
            print nCell, cellTot[ii] > fOutCounts;
        }
    }

    if (exactCells==0) {
        maxUMI = (nCellsTotal > 0) ? cellTotSort[nCellsTotal]+0 : 0;
        print "maxUMIperCell=" maxUMI,"Robust maxUMIperCel=" nMax,"minUMIperCell=" nMin, "Filtered N cells=", nCell;
    } else {
        print "total N cells=" nCellsTotal, "exactCells=" exactCells, "minUMIperCell=" nMin;
    }

    # Count the matrix rows that survive; the total is needed for the dimensions
    # line, which precedes the rows. A row survives exactly when its cell was
    # given a new index above, so rows whose cell index has no barcode are dropped
    # rather than written with an empty column.
    nMat = 0;
    for (ii=nHeaderLines+1; ii<=nLines; ii++) {
        if (cellI[ii] in cellInew)
            nMat++;
    }

    # All header lines except the last are copied; the last one is rewritten
    # with the filtered cell and row counts.
    for (ii=1; ii<nHeaderLines; ii++)
        print A[ii] > fOutMat;
    print nGenes, nCell+0, nMat+0 > fOutMat;

    for (ii=nHeaderLines+1; ii<=nLines; ii++) {
        if (cellI[ii] in cellInew)
            print cellG[ii], cellInew[cellI[ii]], cellN[ii] > fOutMat;
    }
}
# end of soloBasicCellFilter.awk