1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
#!/bin/sh
# Execute bcftools on HG002, HG003 and HG004 samples. This could be done as
# a trio with pedegree information, but for now we just do them without
# additional data so we can evaluate calling in the presence of other samples.
#
# We have the GIAB benchmark files, but the HG002-4 BAMs need to be passed in
# as inputs as this may vary by instrument.
# Standard files, downloaded by the get_data.sh script.
# They can be overridden on the command line before running the script
# eg "BED=hard_regions.bed ./run_mpileup.sh [opts]"
# Note if in the shell you do: === () { /bin/true; }
# then you can cut and paste the === lines as the first part becomes a
# semi-colon capable comment.
HREF38=${HREF38:-/nfs/srpipe_references/references/Human/GRCh38_full_analysis_set_plus_decoy_hla/all/fasta/Homo_sapiens.GRCh38_full_analysis_set_plus_decoy_hla.fa}
TRUTH002=${TRUTH002:-HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz}
TRUTH003=${TRUTH003:-HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz}
TRUTH004=${TRUTH004:-HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz}
BED002=${BED002:-HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed}
BED003=${BED003:-HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed}
BED004=${BED004:-HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed}
bcftools=${BCFTOOLS:-bcftools}
BENCHDIR=${BENCHDIR:-.}
if [ $# -lt 5 ]
then
echo Usage: run_multi.sh HG002.bam HG003.bam HG004.bam region outdir [bcftools-arguments]
echo
echo Internal variables may be overridden on the command line, eg
echo BCFTOOLS=~/bcftools/bcftools.myPR ./run_multi.sh [args]
echo
echo Current settings:
echo BCFTOOLS=${bcftools}
echo HREF38=${HREF38}
echo TRUTH002=${TRUTH002}
echo TRUTH003=${TRUTH003}
echo TRUTH004=${TRUTH004}
echo BED002=${BED002}
echo BED003=${BED003}
echo BED004=${BED004}
echo BENCHDIR=${BENCHDIR}
exit 1
fi
file2=$1
file3=$2
file4=$3
region=$4
dir=$5
shift 5
args=${@+"$@"}
mkdir -p $dir
# Change to "if false" if we wish to adjust the plots only without recalling.
if true
then
# Do the actual variant calling
echo "===; Running $bcftools mpileup $args -a AD --fasta-ref $HREF38 -r $region $file2 $file3 $file4 | bcftools call -vm -"
eval $bcftools mpileup $args -a AD --fasta-ref $HREF38 -r $region $file2 $file3 $file4 2>$dir/bcftools.mpileup.out | $bcftools call -vm - > $dir/bcftools.vcf 2>$dir/bcftools.call.out
# Split into the input samples
echo
echo "===; $bcftools +split $dir/bcftools.vcf -o $dir"
eval $bcftools +split $dir/bcftools.vcf -o $dir
set -- `$bcftools view -h $dir/bcftools.vcf|tail -1|cut -f 10-`
vcf2=$1.vcf
vcf3=$2.vcf
vcf4=$3.vcf
# Remove non-sample data
echo
echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf2 | grep -v '[.0]/[.0]:' > $dir/_$vcf2"
eval $bcftools norm -m -both -f $HREF38 $dir/$vcf2 | grep -v '[.0]/[.0]:' > $dir/_$vcf2
echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf3 | grep -v '[.0]/[.0]:' > $dir/_$vcf3"
eval $bcftools norm -m -both -f $HREF38 $dir/$vcf3 | grep -v '[.0]/[.0]:' > $dir/_$vcf3
echo "===; $bcftools norm -m -both -f $HREF38 $dir/$vcf4 | grep -v '[.0]/[.0]:' > $dir/_$vcf4"
eval $bcftools norm -m -both -f $HREF38 $dir/$vcf4 | grep -v '[.0]/[.0]:' > $dir/_$vcf4
# A primary evaluation.
# They key thing here is it leaves behind the .isec directory with the
# intersection of the truth and call sets.
echo
echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH002 $dir/_$vcf2 '' $BED002 $region"
QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH002 $dir/_$vcf2 "" $BED002 $region
echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH003 $dir/_$vcf3 '' $BED003 $region"
QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH003 $dir/_$vcf3 "" $BED003 $region
echo "===; QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH004 $dir/_$vcf4 '' $BED004 $region"
QUAL=30 NORM=1 ${BENCHDIR}/compare_vcf_simple.sh $TRUTH004 $dir/_$vcf4 "" $BED004 $region
# Produce a .plot file for use in gnuplot, along with a basic summary too.
echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf2.isec indel > $dir/2.all"
${BENCHDIR}/plot_isec.pl $dir/_$vcf2.isec indel > $dir/2.out
grep ALL $dir/2.out > $dir/2.all
grep INS $dir/2.out > $dir/2.ins
grep DEL $dir/2.out > $dir/2.del
awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/2.all | cut -c 1-28|head -20
echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf3.isec indel > $dir/3.all"
${BENCHDIR}/plot_isec.pl $dir/_$vcf3.isec indel > $dir/3.out
grep ALL $dir/3.out > $dir/3.all
grep INS $dir/3.out > $dir/3.ins
grep DEL $dir/3.out > $dir/3.del
awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/3.all | cut -c 1-28|head -20
echo "===; ${BENCHDIR}/plot_isec.pl $dir/_$vcf4.isec indel > $dir/4.all"
${BENCHDIR}/plot_isec.pl $dir/_$vcf4.isec indel > $dir/4.out
grep ALL $dir/4.out > $dir/4.all
grep INS $dir/4.out > $dir/4.ins
grep DEL $dir/4.out > $dir/4.del
awk 'BEGIN {n=0} $6 >= n {print;n=50*(1+int($6/50))}' $dir/4.all | cut -c 1-28|head -20
else
# Short cut to replot results without rerunning the calling
set -- `$bcftools view -h $dir/bcftools.vcf|tail -1|cut -f 10-`
vcf2=$1.vcf
vcf3=$2.vcf
vcf4=$3.vcf
fi
# Generate GNU plots
echo === Running gnuplot to create "HG00[234]*.png"
gnuplot <<EOF
set xlabel "FPr"
set ylabel "FNr"
set yrange [0:10]
set title "indels"
set terminal png size 800,600
set output "$dir/HG002_FP.png"
a=8;b=10;plot "$dir/2.all" using a:b with linespoints title "HG002-FP vs FN" lw 2 ps .6,
set xlabel "GTr"
set output "$dir/HG002_GT.png"
a=9;b=10;plot "$dir/2.all" using a:b with linespoints title "HG002-GT vs FN" lw 2 ps .6,
EOF
gnuplot <<EOF
set xlabel "FPr"
set ylabel "FNr"
set yrange [0:10]
set title "indels"
set terminal png size 800,600
set output "$dir/HG003_FP.png"
a=8;b=10;plot "$dir/3.all" using a:b with linespoints title "HG003-FP vs FN" lw 2 ps .6,
set xlabel "GTr"
set output "$dir/HG003_GT.png"
a=9;b=10;plot "$dir/3.all" using a:b with linespoints title "HG003-GT vs FN" lw 2 ps .6,
EOF
gnuplot <<EOF
set xlabel "FPr"
set ylabel "FNr"
set yrange [0:10]
set title "indels"
set terminal png size 800,600
set output "$dir/HG004_FP.png"
a=8;b=10;plot "$dir/4.all" using a:b with linespoints title "HG004-FP vs FN" lw 2 ps .6,
set xlabel "GTr"
set output "$dir/HG004_GT.png"
a=9;b=10;plot "$dir/4.all" using a:b with linespoints title "HG004-GT vs FN" lw 2 ps .6,
EOF
# Example gnuplot:
# set xlabel "FPr"
# set ylabel "FNr"
# set yrange [0:10]
# set title "PacBio 50x indels"
# FP vs FN
# a=8;b=10;t="ALL";plot "<grep ".t." dev/plot" using a:b with linespoints title "dev" lw 2 ps .6, "<grep ".t." cns/plot" using a:b with linespoints title "indels-cns" lw 2 ps .6
# GT vs FN
# a=9;b=10; ... as above
|