File: map-sim

package info (click to toggle)
vg 1.59.0%2Bds-0.1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 361,528 kB
  • sloc: cpp: 479,590; ansic: 191,648; python: 23,671; javascript: 13,961; sh: 7,025; makefile: 5,577; perl: 3,636; lisp: 293; java: 136
file content (109 lines) | stat: -rwxr-xr-x 5,812 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash

if [ $# -ne 13 ];
then
    echo "usage: "$(basename $0) "[output-dir] [fasta-ref] [vg-ref] [vg-pan] [hap0-base] [hap1-base] [sim-base] [sim-ref] [threads] [sim-read-spec] [sim-seed] [bp-threshold] [vg-map-opts]"
    echo "example: "$(basename $0) 'SGRP2/SGD_2010.fasta SGRP2/SGRP2-cerevisiae.pathonly SGRP2/SGRP2-cerevisiae SGRP2/SGRP2-cerevisiae BC187-haps0 BC187-haps1 BC187 BC187.ref 4 "-n 50000 -e 0.01 -i 0.002 -l 150 -p 500 -v 50" 27 150 "-u 16"'
    exit
fi

output=$1
fasta=$2
ref=$3
pan=$4
haps1=$5
haps2=$6
sim=$7
sim_ref=$8
threads=$9
read_spec=${10}
seed=${11}
threshold=${12}
vg_map_opts=${13}

pan_xg=$pan.xg
pan_gcsa=$pan.gcsa
ref_xg=$ref.xg
ref_gcsa=$ref.gcsa
haps1_xg=$haps1.xg
haps2_xg=$haps2.xg
sim_xg=$sim.xg
sim_ref_xg=$sim_ref.xg

echo $sim_xg $ref_xg $pan_xg

mkdir -p $output

# Get the vg id
id=$(vg version | cut -f 3 -d- | tail -c 8 | head -c 7)
echo testing vg-$id

# generate the simulated reads if we haven't already
if [ ! -e $output/sim.gam.truth.tsv ];
then
    echo generating simulated reads
    # -s 271 -n $num_reads -e 0.01 -i 0.002 -l 150 -p 500 -v 50
    vg sim $read_spec -s $seed -x $haps1_xg -a >$output/sim1.gam &
    vg sim $read_spec -s $(echo "$seed + 1" | bc) -x $haps2_xg -a >$output/sim2.gam &
    wait
    cat $output/sim1.gam $output/sim2.gam >$output/sim.gam    
    rm -f $output/sim1.gam $output/sim2.gam
    vg annotate -p -x $sim_xg -a $output/sim.gam | vg view -a - | jq -c -r '[ .name, .refpos[0].name, .refpos[0].offset ] | @tsv' | pv -l | sort >$output/truth.tsv
    vg annotate -n -x $sim_ref_xg -a $output/sim.gam | tail -n+2 | pv -l | sort >$output/novelty.tsv
    join $output/truth.tsv $output/novelty.tsv >$output/sim.gam.truth.tsv
    # split the file into the mates
    vg view -X $output/sim.gam | gzip >$output/sim.fq.gz &
    vg view -a $output/sim.gam | jq -cr 'select(.name | test("_1$"))' | vg view -JaG - | vg view -X - | sed s/_1$// | gzip >$output/sim_1.fq.gz &
    vg view -a $output/sim.gam | jq -cr 'select(.name | test("_2$"))' | vg view -JaG - | vg view -X - | sed s/_2$// | gzip >$output/sim_2.fq.gz &
    wait
fi

# This can then be mapped six ways.

# By bwa:
echo bwa mem paired mapping
time bwa mem -t $threads $fasta $output/sim_1.fq.gz $output/sim_2.fq.gz | grep -v ^@ | perl -ne '@val = split("\t", $_); print @val[0] . "_" . (@val[1] & 64 ? "1" : @val[1] & 128 ? "2" : "?"), "\t" . @val[2] . "\t" . (@val[3] +  int(length(@val[9]) / 2)) . "\t" . @val[4] . "\t" . @val[13] . "\n";' | sed s/AS:i:// | pv -l | sort >$output/bwa_mem-pe.pos
join $output/bwa_mem-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/bwa-pe.compare
# map single end
echo bwa mem single mapping
time bwa mem -t $threads $fasta $output/sim.fq.gz | grep -v ^@ | pv -l | awk -v OFS="\t" '{$4=($4 + int(length($10) / 2)); print}' | cut -f 1,3,4,5,14 | sed s/AS:i:// | sort >$output/bwa_mem-se.pos
join $output/bwa_mem-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/bwa-se.compare

# By vg-ref:
echo vg ref paired mapping
time vg map $vg_map_opts -iG $output/sim.gam -x $ref_xg -g $ref_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-ref-pe.pos
join $output/vg-ref-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-ref-pe.compare
echo vg ref single mapping
time vg map $vg_map_opts -G $output/sim.gam -x $ref_xg -g $ref_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-ref-se.pos
join $output/vg-ref-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-ref-se.compare

# By vg-pan:
echo vg pan paired mappping
time vg map $vg_map_opts -iG $output/sim.gam -x $pan_xg -g $pan_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-pan-pe.pos
join $output/vg-pan-pe.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-pan-pe.compare
echo vg pan single mappping
time vg map $vg_map_opts -G $output/sim.gam -x $pan_xg -g $pan_gcsa -t $threads --refpos-table | pv -l | sort >$output/vg-pan-se.pos
join $output/vg-pan-se.pos $output/sim.gam.truth.tsv | vg_sim_pos_compare.py $threshold >$output/vg-pan-se.compare

# Now we combine the various positions into one table

echo combining results
( cat $output/bwa-pe.compare | awk 'BEGIN { OFS="\t"; print "correct", "mq", "score", "length", "unaligned", "known.nodes", "known.bp", "novel.nodes", "novel.bp", "aligner"; } { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "bwa.mem.pe" }' ;
  cat $output/bwa-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "bwa.mem.se" }' ;
  cat $output/vg-ref-pe.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.ref.pe" }' ;
  cat $output/vg-ref-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.ref.se" }' ;
  cat $output/vg-pan-pe.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.pan.pe" }' ;
  cat $output/vg-pan-se.compare | awk 'BEGIN { OFS="\t"} { print $2, $3, $4, $5, $6, $7, $8, $9, $10, "vg.pan.se" }') | gzip >$output/results-$id.tsv.gz

( zcat $output/results-$id.tsv.gz | head -1; zcat $output/results-$id.tsv.gz | tail -n+2 | awk '{ if ($8 == 0) print }' ) | gzip >$output/results-known-$id.tsv.gz
( zcat $output/results-$id.tsv.gz | head -1; zcat $output/results-$id.tsv.gz | tail -n+2 | awk '{ if ($8 > 0) print }' ) | gzip >$output/results-novel-$id.tsv.gz

# This can then be rendered using scripts in the vg repo
echo rendering ROC
plot-roc.R $output/results-$id.tsv.gz $output/roc-$id.pdf
echo rendering known ROC
plot-roc.R $output/results-known-$id.tsv.gz $output/roc-known-$id.pdf
echo rendering novel ROC
plot-roc.R $output/results-novel-$id.tsv.gz $output/roc-novel-$id.pdf
echo rendering QQ
plot-qq.R $output/results-$id.tsv.gz $output/qq-$id.pdf