File: evaluate-genotypes-ssi-keras.sh

package info (click to toggle)
libgoby-java 3.3.1%2Bdfsg2-11
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 58,108 kB
  • sloc: java: 78,105; cpp: 5,011; xml: 3,170; python: 2,108; sh: 1,575; ansic: 277; makefile: 114
file content (212 lines) | stat: -rwxr-xr-x 10,631 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env bash
function dieIfError {
    if [ ! $? == 0 ]; then
     echo "An error was encountered ($1)"
     exit;
    fi
}

function assertRTGInstalled {
    echo no | rtg version >/dev/null 2>&1 || { echo >&2 "This script requires rtg but it's not installed. Aborting. Install rtg (see https://github.com/genome-in-a-bottle/giab_FAQ) and add the rtg executable to your path, then try again."; exit 1; }
}
assertRTGInstalled

function assertVcfToolsInstalled {
   echo done| vcf-sort >/dev/null 2>&1 || { echo >&2 "This script requires vcf-sort but it's not installed. Aborting. Install vcf-tools (see https://sourceforge.net/projects/vcftools/files/) and add the vcf-sort executable to your path, then try again."; exit 1; }
}
assertVcfToolsInstalled

function assertBedtoolsInstalled {
   bedtools --version  >/dev/null 2>&1 || { echo >&2 "This script requires bedtools but it's not installed. Aborting. Install bedtools (see http://bedtools.readthedocs.io/en/latest/content/installation.html) and add the bedtools executable to your path, then try again."; exit 1; }
}
assertBedtoolsInstalled

if [ -e configure.sh ]; then
    echo "Loading configure.sh"
    source configure.sh
fi

NUM_ARGS="$#"
NUM_ARGS_EXPECTED="${NUM_ARGS}"

if [ -z "${OUTPUT_PREFIX+set}" ]; then
    NUM_ARGS_EXPECTED=3
fi

if [ -z "${NO_OUTPUT_ORIGINAL+set}" ]; then
    OUTPUT_ORIGINAL=true
fi

if [ -z "${NO_OUTPUT_ERROR+set}" ]; then
    OUTPUT_ERROR=true
fi

if [ "${NUM_ARGS}" == 3 ]; then
    unset OUTPUT_PREFIX
fi

if [ ${NUM_ARGS} != ${NUM_ARGS_EXPECTED} ] || [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
    echo "Usage: evaluate-genotypes-ssi-keras.sh [model-path test-dir-path output-prefix]"
    echo "The env variables GOLD_STANDARD_VCF_SNP_GZ GOLD_STANDARD_VCF_INDEL_GZ and GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ can be used to change the VCFs and confident region bed."
    echo "The first run downloads these files from the Genome in a Bottle for sample NA12878 when the variables are not defined."
    echo "To skip VCF prediction and use outputted VCFs and BEDs from a previous evaluate run, set OUTPUT_PREFIX to common prefix for SNP and INDEL VCF and BED files, as generated previously."
    echo "Arguments are optional only when OUTPUT_PREFIX is set."
    echo "GOBY_VERSION can be set to specify the version of Goby in the outputted VCF files."
    exit 1
fi

if [ -z "${GOLD_STANDARD_VCF_SNP_GZ+set}" ] || [ -z "${GOLD_STANDARD_VCF_INDEL_GZ+set}" ]; then
    if [ -z "${GOLD_STANDARD_VCF_GZ+set}" ]; then
        echo "Downloading Gold standard Genome in a Bottle VCF. Define GOLD_STANDARD_VCF_GZ to use an alternate Gold Standard."
        rm -fr HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz.*
        wget ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.1/GRCh37/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz
        mv HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz GIAB-NA12878-confident.vcf.gz
        # add "chr prefix:"
        gzip -c -d GIAB-NA12878-confident.vcf.gz|awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' >GIAB-NA12878-confident-chr.vcf
        bgzip -f GIAB-NA12878-confident-chr.vcf
        tabix -f GIAB-NA12878-confident-chr.vcf.gz
        echo 'export GOLD_STANDARD_VCF_GZ="GIAB-NA12878-confident-chr.vcf.gz"' >>configure.sh
        export GOLD_STANDARD_VCF_GZ="GIAB-NA12878-confident-chr.vcf.gz"
    else
        echo "Formatting GOLD_STANDARD_VCF_GZ VCF for SNPs and indels"
    fi

    # remove non-SNPs:
    gzip -c -d  ${GOLD_STANDARD_VCF_GZ} |awk '{if($0 !~ /^#/) { if (length($4)==1 && length($5)==1) print $0;}  else {print $0}}' >GOLD-confident-chr-snps.vcf
    bgzip -f GOLD-confident-chr-snps.vcf
    tabix -f GOLD-confident-chr-snps.vcf.gz
    export GOLD_STANDARD_VCF_SNP_GZ="GOLD-confident-chr-snps.vcf.gz"

    # keep only indels:
    gzip -c -d  ${GOLD_STANDARD_VCF_GZ} |awk '{if($0 !~ /^#/) { if (length($4)!=1 || length($5)!=1) print $0;}  else {print $0}}' >GOLD-confident-chr-indels.vcf
    bgzip -f GOLD-confident-chr-indels.vcf
    tabix -f GOLD-confident-chr-indels.vcf.gz
    export GOLD_STANDARD_VCF_INDEL_GZ="GOLD-confident-chr-indels.vcf.gz"
    echo 'export GOLD_STANDARD_VCF_SNP_GZ="GOLD-confident-chr-snps.vcf.gz"' >>configure.sh
    echo 'export GOLD_STANDARD_VCF_INDEL_GZ="GOLD-confident-chr-indels.vcf.gz"' >>configure.sh
    echo "Gold standard VCF downloaded for NA12878 (SNPs) and named in configure.sh. Edit GOLD_STANDARD_VCF_SNP_GZ/GOLD_STANDARD_VCF_INDEL_GZ to switch to a different gold-standard validation VCF."
fi


if [ -z "${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ+set}" ]; then
    echo "Downloading Gold standard Genome in a Bottle Confident Regions (bed)"
    rm -fr HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed*
    wget ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.1/GRCh37/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed
    mv HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed GIAB-NA12878-confident-regions.bed
    # add "chr prefix:"
    cat  GIAB-NA12878-confident-regions.bed |awk '{print "chr"$1"\t"$2"\t"$3}' >GIAB-NA12878-confident-regions-chr.bed
    bgzip -f GIAB-NA12878-confident-regions-chr.bed
    tabix -f GIAB-NA12878-confident-regions-chr.bed.gz
    rm GIAB-NA12878-confident-regions.bed

    GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ="GIAB-NA12878-confident-regions-chr.bed.gz"
    echo 'export GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ="GIAB-NA12878-confident-regions-chr.bed.gz"' >>configure.sh
    echo "Gold standard confident regions downloaded for NA12878  and named in configure.sh. Edit GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ to switch to a different gold-standard confident region bed file."
fi

if [ -z "${RTG_TEMPLATE+set}" ]; then
      RTG_TEMPLATE="hg19.sdf"
      echo "RTG_TEMPLATE not set, using default=${RTG_TEMPLATE}"
fi

if [ ! -e "${RTG_TEMPLATE}" ]; then
     echo "You must install an rtg template, or build one (rtg format file.fa -o template.sdf) in the current directory. See rtg downloads at http://www.realtimegenomics.com/news/pre-formatted-reference-datasets/"
     exit 1;
fi

if [ -z "${OUTPUT_PREFIX+set}" ]; then
    echo "OUTPUT_PREFIX not defined. Running GenerateVCF first..."
    arg_string="goby-python-with-path.sh dl/GenerateVCF.py --model "$1" --testing "$2" --prefix "$3""
    if [ -n "${GOBY_VERSION}" ]; then
        arg_string=""${arg_string}" --version "${GOBY_VERSION}""
    fi
    if [ -n "${OUTPUT_ORIGINAL}" ]; then
        arg_string=""${arg_string}" --generate-original-vcf"
    fi
    if [ -n "${OUTPUT_ERROR}" ]; then
        arg_string=""${arg_string}" --generate-error-vcf"
    fi
    eval ${arg_string}
    dieIfError "Failed to generate VCF with arguments $*"
    echo "Evaluation with rtg vcfeval starting.."
    OUTPUT_PREFIX="$3"
fi

curr_date="`date +%m`_`date +%d`"
RTG_OUTPUT_FOLDER="output/${curr_date}"
mkdir -p ${RTG_OUTPUT_FOLDER}
RTG_OUTPUT_FOLDER="${RTG_OUTPUT_FOLDER}/$((`ls -l ${RTG_OUTPUT_FOLDER} | grep -c ^d` + 1))"
mkdir -p ${RTG_OUTPUT_FOLDER}

VCF_OUTPUT=${OUTPUT_PREFIX}.vcf
BED_OUTPUT=${OUTPUT_PREFIX}.bed

cp ${VCF_OUTPUT} ${RTG_OUTPUT_FOLDER}
cp ${BED_OUTPUT} ${RTG_OUTPUT_FOLDER}

VCF_OUTPUT_SORTED=`basename ${VCF_OUTPUT} .vcf`-sorted.vcf

if [ ! -e "${VCF_OUTPUT_SORTED}.gz" ]; then
    cat ${VCF_OUTPUT} | vcf-sort > ${VCF_OUTPUT_SORTED}
    dieIfError "Unable to sort prediction VCF."

    bgzip -f ${VCF_OUTPUT_SORTED}
    tabix -f -p vcf ${VCF_OUTPUT_SORTED}.gz
fi

BED_OUTPUT_SORTED=`basename ${BED_OUTPUT} .bed`-sorted.bed

if [ ! -e "${BED_OUTPUT_SORTED}.gz" ]; then
    # note -V option on chromosome key below is necessary on Centos 7, with sort version 8.22,
    # see https://github.com/chapmanb/bcbio-nextgen/issues/624
    sort -k1,1V -k2,2n ${BED_OUTPUT} | bedtools merge > ${BED_OUTPUT_SORTED}
    bgzip -f ${BED_OUTPUT_SORTED}
    tabix -f -p bed ${BED_OUTPUT_SORTED}.gz
fi

cp ${VCF_OUTPUT_SORTED}.gz* ${RTG_OUTPUT_FOLDER}
cp ${BED_OUTPUT_SORTED}.gz* ${RTG_OUTPUT_FOLDER}


VCF_OUTPUT_SORTED_SNPS=`basename ${VCF_OUTPUT_SORTED} .vcf`-snps.vcf
gzip -c -d ${VCF_OUTPUT_SORTED}.gz | awk '{if($0 !~ /^#/) { if (length($4)==1 && length($5)==1) print $0;}  else {print $0}}' > ${VCF_OUTPUT_SORTED_SNPS}
bgzip -f ${VCF_OUTPUT_SORTED_SNPS}
tabix -f -p vcf ${VCF_OUTPUT_SORTED_SNPS}.gz

VCF_OUTPUT_SORTED_INDELS=`basename ${VCF_OUTPUT_SORTED} .vcf`-indels.vcf
gzip -c -d ${VCF_OUTPUT_SORTED}.gz | awk '{if($0 !~ /^#/) { if (length($4)!=1 || length($5)!=1) print $0;}  else {print $0}}' > ${VCF_OUTPUT_SORTED_INDELS}
bgzip -f ${VCF_OUTPUT_SORTED_INDELS}
tabix -f -p vcf ${VCF_OUTPUT_SORTED_INDELS}.gz

rtg vcfeval --baseline=${GOLD_STANDARD_VCF_SNP_GZ}  \
        -c ${VCF_OUTPUT_SORTED_SNPS}.gz -o ${RTG_OUTPUT_FOLDER}/snps --template=${RTG_TEMPLATE}  \
            --evaluation-regions=${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ} \
            --bed-regions=${BED_OUTPUT_SORTED}.gz \
            --vcf-score-field=P  --sort-order=descending
dieIfError "Failed to run rtg vcfeval for SNPs."
rtg vcfeval --baseline=${GOLD_STANDARD_VCF_INDEL_GZ}  \
        -c ${VCF_OUTPUT_SORTED_INDELS}.gz -o ${RTG_OUTPUT_FOLDER}/indels --template=${RTG_TEMPLATE}  \
            --evaluation-regions=${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ} \
            --bed-regions=${BED_OUTPUT_SORTED}.gz \
            --vcf-score-field=P  --sort-order=descending
dieIfError "Failed to run rtg vcfeval."

cp ${VCF_OUTPUT_SORTED_SNPS}.gz* ${RTG_OUTPUT_FOLDER}/snps
cp ${VCF_OUTPUT_SORTED_INDELS}.gz* ${RTG_OUTPUT_FOLDER}/indels

RTG_ROCPLOT_OPTIONS="--scores"
rtg rocplot ${RTG_OUTPUT_FOLDER}/snps/snp_roc.tsv.gz --svg ${RTG_OUTPUT_FOLDER}/snps/SNP-ROC.svg ${RTG_ROCPLOT_OPTIONS} --title="SNPs, model ${1}"
dieIfError "Unable to generate SNP ROC plot."

rtg rocplot ${RTG_OUTPUT_FOLDER}/snps/snp_roc.tsv.gz -P --svg ${RTG_OUTPUT_FOLDER}/snps/SNP-PrecisionRecall.svg ${RTG_ROCPLOT_OPTIONS} --title="SNPs, model ${1}"
dieIfError "Unable to generate SNP Precision Recall plot."

rtg rocplot ${RTG_OUTPUT_FOLDER}/indels/non_snp_roc.tsv.gz -P --svg ${RTG_OUTPUT_FOLDER}/indels/INDEL-PrecisionRecall.svg ${RTG_ROCPLOT_OPTIONS} --title="INDELs, model ${1}"
dieIfError "Unable to generate indel Precision Recall plot."

rtg rocplot ${RTG_OUTPUT_FOLDER}/indels/non_snp_roc.tsv.gz --svg ${RTG_OUTPUT_FOLDER}/indels/INDEL-ROC.svg  ${RTG_ROCPLOT_OPTIONS} --title="INDELs, model ${1}"
dieIfError "Unable to generate indel ROC plot."