1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
|
#!/bin/bash
if [ $# -ne 13 ];
then
echo "usage: "$(basename $0) "[output-dir] [fasta-ref] [vg-ref] [vg-pan] [hap0-base] [hap1-base] [sim-base] [sim-ref] [threads] [sim-read-spec] [sim-seed] [bp-threshold] [vg-map-opts]"
echo "example: "$(basename $0) 'SGRP2/SGD_2010.fasta SGRP2/SGRP2-cerevisiae.pathonly SGRP2/SGRP2-cerevisiae SGRP2/SGRP2-cerevisiae BC187-haps0 BC187-haps1 BC187 BC187.ref 4 "-n 50000 -e 0.01 -i 0.002 -l 150 -p 500 -v 50" 27 150 "-u 16"'
exit
fi
output=$1
fasta=$2
ref=$3
pan=$4
haps1=$5
haps2=$6
sim=$7
sim_ref=$8
threads=$9
read_spec=${10}
seed=${11}
threshold=${12}
vg_map_opts=${13}
pan_xg=$pan.xg
pan_gcsa=$pan.gcsa
ref_xg=$ref.xg
ref_gcsa=$ref.gcsa
haps1_xg=$haps1.xg
haps2_xg=$haps2.xg
sim_xg=$sim.xg
sim_ref_xg=$sim_ref.xg
echo $sim_xg $ref_xg $pan_xg
mkdir -p $output
# Get the vg id
id=$(vg version | cut -f 3 -d- | tail -c 8 | head -c 7)
echo testing vg-$id
# generate the simulated reads if we haven't already
if [ ! -e $output/sim.gam.truth.tsv ];
then
echo generating simulated reads
# -s 271 -n $num_reads -e 0.01 -i 0.002 -l 150 -p 500 -v 50
vg sim $read_spec -s $seed -x $haps1_xg -a >$output/sim1.gam &
vg sim $read_spec -s $(echo "$seed + 1" | bc) -x $haps2_xg -a >$output/sim2.gam &
wait
cat $output/sim1.gam $output/sim2.gam >$output/sim.gam
rm -f $output/sim1.gam $output/sim2.gam
vg annotate -p -x $sim_xg -a $output/sim.gam | vg view -a - | jq -c -r '[ .name, .refpos[0].name, .refpos[0].offset ] | @tsv' | pv -l | sort >$output/truth.tsv
vg annotate -n -x $sim_ref_xg -a $output/sim.gam | tail -n+2 | pv -l | sort >$output/novelty.tsv
join $output/truth.tsv $output/novelty.tsv >$output/sim.gam.truth.tsv
# split the file into the mates
vg view -X $output/sim.gam | gzip >$output/sim.fq.gz &
vg view -a $output/sim.gam | jq -cr 'select(.name | test("_1$"))' | vg view -JaG - | vg view -X - | sed s/_1$// | gzip >$output/sim_1.fq.gz &
vg view -a $output/sim.gam | jq -cr 'select(.name | test("_2$"))' | vg view -JaG - | vg view -X - | sed s/_2$// | gzip >$output/sim_2.fq.gz &
wait
fi
# This can then be mapped six ways.
# By bwa:
echo bwa mem paired mapping
time bwa mem -t $threads $fasta $output/sim_1.fq.gz $output/sim_2.fq.gz | grep -v ^@ | perl -ne '@val = split("\t", $_); print @val[0] . "_" . (@val[1] & 64 ? "1" : @val[1] & 128 ? "2" : "?"), "\t" . @val[2] . "\t" . (@val[3] + int(length(@val[9]) / 2)) . "\t" . @val[4] . "\t" . @val[13] . "\n";' | sed s/AS:i:// | pv -l | sort >$output/bwa_mem-pe.pos
join $output/bwa_mem-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/bwa-pe.compare
# map single end
echo bwa mem single mapping
time bwa mem -t $threads $fasta $output/sim.fq.gz | grep -v ^@ | pv -l | awk -v OFS="\t" '{$4=($4 + int(length($10) / 2)); print}' | cut -f 1,3,4,5,14 | sed s/AS:i:// | sort >$output/bwa_mem-se.pos
join $output/bwa_mem-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/bwa-se.compare
# By vg-ref:
echo vg ref paired mapping
time vg map $vg_map_opts -iG $output/sim.gam -x $ref_xg -g $ref_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-ref-pe.pos
join $output/vg-ref-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-ref-pe.compare
echo vg ref single mapping
time vg map $vg_map_opts -G $output/sim.gam -x $ref_xg -g $ref_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-ref-se.pos
join $output/vg-ref-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-ref-se.compare
# By vg-pan:
echo vg pan paired mappping
time vg map $vg_map_opts -iG $output/sim.gam -x $pan_xg -g $pan_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-pan-pe.pos
join $output/vg-pan-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-pan-pe.compare
echo vg pan single mappping
time vg map $vg_map_opts -G $output/sim.gam -x $pan_xg -g $pan_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-pan-se.pos
join $output/vg-pan-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-pan-se.compare
# Now we combine the various positions into one table
echo combining results
( cat $output/bwa-pe.compare | awk 'BEGIN { OFS="\t"; print "correct", "mq", "score", "length", "unaligned", "known.nodes", "known.bp", "novel.nodes", "novel.bp", "aligner"; } { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "bwa.mem.pe" }' ;
cat $output/bwa-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "bwa.mem.se" }' ;
cat $output/vg-ref-pe.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.ref.pe" }' ;
cat $output/vg-ref-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.ref.se" }' ;
cat $output/vg-pan-pe.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.pan.pe" }' ;
cat $output/vg-pan-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.pan.se" }') | gzip >$output/results-$id.tsv.gz
( zcat $output/results-$id.tsv.gz | head -1; zcat $output/results-$id.tsv.gz | tail -n+2 | awk '{ if ($8 == 0) print }' ) | gzip >$output/results-known-$id.tsv.gz
( zcat $output/results-$id.tsv.gz | head -1; zcat $output/results-$id.tsv.gz | tail -n+2 | awk '{ if ($8 > 0) print }' ) | gzip >$output/results-novel-$id.tsv.gz
# This can then be rendered using scripts in the vg repo
echo rendering ROC
plot-roc.R $output/results-$id.tsv.gz $output/roc-$id.pdf
echo rendering known ROC
plot-roc.R $output/results-known-$id.tsv.gz $output/roc-known-$id.pdf
echo rendering novel ROC
plot-roc.R $output/results-novel-$id.tsv.gz $output/roc-novel-$id.pdf
echo rendering QQ
plot-qq.R $output/results-$id.tsv.gz $output/qq-$id.pdf
|