File: benchmark

package info (click to toggle)
cutesv 2.1.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,360 kB
  • sloc: python: 5,192; sh: 102; makefile: 12
file content (101 lines) | stat: -rwxr-xr-x 4,719 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/sh -e

echo "We need to package https://github.com/PacificBiosciences/pbsv first!"
echo "Also https://github.com/ACEnglish/truvari for comparison would be nice to have."
exit

# https://github.com/tjiangHIT/sv-benchmark

# 1. Create directory structure
mkdir -p fastqs ref alns tools/{pbsv,sniffles,svim,cuteSV} giab

# 2. Download genome in a bottle annotations:
FTPDIR=ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/analysis/
curl -s ${FTPDIR}/NIST_SVs_Integration_v0.6/HG002_SVs_Tier1_v0.6.bed > giab/HG002_SVs_Tier1_v0.6.bed
curl -s ${FTPDIR}/NIST_SVs_Integration_v0.6/HG002_SVs_Tier1_v0.6.vcf.gz > giab/HG002_SVs_Tier1_v0.6.vcf.gz

# 3. Download hg19 reference with decoys and map non-ACGT characters to N:
curl -s ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz > ref/human_hs37d5.fasta.gz
gunzip ref/human_hs37d5.fasta.gz
sed -i '/^[^>]/ y/BDEFHIJKLMNOPQRSUVWXYZbdefhijklmnopqrsuvwxyz/NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN/' ref/human_hs37d5.fasta

# 4. Download hg19 tandem repeat annotations:
curl -s https://raw.githubusercontent.com/PacificBiosciences/pbsv/master/annotations/human_hs37d5.trf.bed > ref/human_hs37d5.trf.bed

# 5. Download all .fastq files:
FTPDIR=ftp://ftp.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/UCSC_Ultralong_OxfordNanopore_Promethion/
for fastq in $(curl -s -l ${FTPDIR} | grep '.fastq'); do curl -s ${FTPDIR}${fastq} > fastqs/${fastq}; done
for fastq in `ls fastqs`; do gunzip fastqs/${fastq}; done

## Alignment
# 6. Align each movie:
for i in `ls fastqs/`;do
    minimap2 ref/human_hs37d5.fasta fastq/$i -a -z 600,200 -x map-ont \
        --MD -Y -o alns/$i.sam -R '@RG\tID:hg2'
done

# 7. Alignments sorting and merging:
for i in `ls alns/`;do
    samtools view -buS alns/$i | samtools sort -O bam -T ./ - > alns/$i.bam
done
for i in `ls alns/*.bam`; do echo $i; done > input_bam.fofn
samtools merge alns/GM24385_all.bam -b input_bam.fofn && samtools index alns/GM24385_all.bam

## Run pbsv
# 8a) Discover SV signatures for each alignment, can be done in parallel:
pbsv discover alns/GM24385_all.bam tools/pbsv/hg2_ont.svsig.gz" --tandem-repeats ref/human_hs37d5.trf.bed

# 8b) Call and polish SVs:
pbsv call ref/human_hs37d5.fasta tools/pbsv/hg2_ont.svsig.gz tools/pbsv/hg2_ont.pbsv.vcf -t INS,DEL
bgzip tools/pbsv/hg2_ont.pbsv.vcf
tabix tools/pbsv/hg2_ont.pbsv.vcf.gz

## Run svim
#9a) Run svim:
svim alignment tools/svim alns/GM24385_all.bam ref/human_hs37d5.fasta --min_sv_size 30

#9b) Prepare for truvari:
cat tools/svim/final_results.vcf \
    | sed 's/INS:NOVEL/INS/g' \
    | sed 's/DUP:INT/INS/g' \
    | sed 's/DUP:TANDEM/INS/g' \
    | awk '{ if($1 ~ /^#/) { print $0 } else { if($5=="<DEL>" || $5=="<INS>") { print $0 }}}' \
    | grep -v 'SUPPORT=1;\|SUPPORT=2;\|SUPPORT=3;\|SUPPORT=4;\|SUPPORT=5;\|SUPPORT=6;\|SUPPORT=7;\|SUPPORT=8;\|SUPPORT=9;' \
    | sed 's/q5/PASS/g' > tools/svim/hg2_ont.svim.vcf
bgzip tools/svim/hg2_ont.svim.vcf
tabix tools/svim/hg2_ont.svim.vcf.gz

## Run sniffles
# 10a) Run sniffles:
sniffles -s 10 -l 30 -m alns/GM24385_all.bam -v tools/sniffles/hg2_ont.sniffles.vcf --genotype

# 10b) Prepare for truvari:
cat <(cat tools/sniffles/hg2_ont.sniffles.vcf | grep "^#") \
    <(cat tools/sniffles/hg2_ont.sniffles.vcf | grep -vE "^#" | \
      grep 'DUP\|INS\|DEL' | sed 's/DUP/INS/g' | sort -k1,1 -k2,2g) \
    | bgzip -c > tools/sniffles/hg2_ont.sniffles.vcf.gz
tabix tools/sniffles/hg2_ont.sniffles.vcf.gz

## Run cuteSV
# 11a) Run cuteSV:
cuteSV alns/GM24385_all.bam tools/cuteSV/hg2_ont.cuteSV.vcf ./ -s 10 -l 30

# 11b) Prepare for truvari:
grep -v 'INV\|DUP\|BND' tools/cuteSV/hg2_ont.cuteSV.vcf | bgzip -c > tools/cuteSV/hg2_ont.cuteSV.vcf.gz
tabix tools/cuteSV/hg2_ont.cuteSV.vcf.gz

## Final comparison
# 11. Compare to ground truth:

truvari -f ref/human_hs37d5.fasta -b giab/HG002_SVs_Tier1_v0.6.vcf.gz\
        --includebed giab/HG002_SVs_Tier1_v0.6.bed -o bench-pbsv --passonly\
        --giabreport -r 1000 -p 0.00 -c tools/pbsv/hg2.pbsv.vcf.gz
truvari -f ref/human_hs37d5.fasta -b giab/HG002_SVs_Tier1_v0.6.vcf.gz\
        --includebed giab/HG002_SVs_Tier1_v0.6.bed -o bench-svim --passonly\
        --giabreport -r 1000 -p 0.00 -c tools/svim/hg2.svim.vcf.gz
truvari -f ref/human_hs37d5.fasta -b giab/HG002_SVs_Tier1_v0.6.vcf.gz\
        --includebed giab/HG002_SVs_Tier1_v0.6.bed -o bench-sniffles --passonly\
        --giabreport -r 1000 -p 0.00 -c tools/sniffles/hg2.sniffles.vcf.gz
truvari -f ref/human_hs37d5.fasta -b giab/HG002_SVs_Tier1_v0.6.vcf.gz\
        --includebed giab/HG002_SVs_Tier1_v0.6.bed -o bench-cuteSV --passonly\
        --giabreport -r 1000 -p 0.00 -c tools/cuteSV/hg2_ont.cuteSV.vcf.gz