1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
|
#!/usr/bin/env bash
function dieIfError {
if [ ! $? == 0 ]; then
echo "An error was encountered ($1)"
exit;
fi
}
function assertRTGInstalled {
echo no | rtg version >/dev/null 2>&1 || { echo >&2 "This script requires rtg but it's not installed. Aborting. Install rtg (see https://github.com/genome-in-a-bottle/giab_FAQ) and add the rtg executable to your path, then try again."; exit 1; }
}
assertRTGInstalled
function assertVcfToolsInstalled {
echo done| vcf-sort >/dev/null 2>&1 || { echo >&2 "This script requires vcf-sort but it's not installed. Aborting. Install vcf-tools (see https://sourceforge.net/projects/vcftools/files/) and add the vcf-sort executable to your path, then try again."; exit 1; }
}
assertVcfToolsInstalled
function assertBedtoolsInstalled {
bedtools --version >/dev/null 2>&1 || { echo >&2 "This script requires bedtools but it's not installed. Aborting. Install bedtools (see http://bedtools.readthedocs.io/en/latest/content/installation.html) and add the bedtools executable to your path, then try again."; exit 1; }
}
assertBedtoolsInstalled
if [ -e configure.sh ]; then
echo "Loading configure.sh"
source configure.sh
fi
NUM_ARGS="$#"
NUM_ARGS_EXPECTED="${NUM_ARGS}"
if [ -z "${OUTPUT_PREFIX+set}" ]; then
NUM_ARGS_EXPECTED=3
fi
if [ -z "${NO_OUTPUT_ORIGINAL+set}" ]; then
OUTPUT_ORIGINAL=true
fi
if [ -z "${NO_OUTPUT_ERROR+set}" ]; then
OUTPUT_ERROR=true
fi
if [ "${NUM_ARGS}" == 3 ]; then
unset OUTPUT_PREFIX
fi
if [ ${NUM_ARGS} != ${NUM_ARGS_EXPECTED} ] || [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
echo "Usage: evaluate-genotypes-ssi-keras.sh [model-path test-dir-path output-prefix]"
echo "The env variables GOLD_STANDARD_VCF_SNP_GZ GOLD_STANDARD_VCF_INDEL_GZ and GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ can be used to change the VCFs and confident region bed."
echo "The first run downloads these files from the Genome in a Bottle for sample NA12878 when the variables are not defined."
echo "To skip VCF prediction and use outputted VCFs and BEDs from a previous evaluate run, set OUTPUT_PREFIX to common prefix for SNP and INDEL VCF and BED files, as generated previously."
echo "Arguments are optional only when OUTPUT_PREFIX is set."
echo "GOBY_VERSION can be set to specify the version of Goby in the outputted VCF files."
exit 1
fi
if [ -z "${GOLD_STANDARD_VCF_SNP_GZ+set}" ] || [ -z "${GOLD_STANDARD_VCF_INDEL_GZ+set}" ]; then
if [ -z "${GOLD_STANDARD_VCF_GZ+set}" ]; then
echo "Downloading Gold standard Genome in a Bottle VCF. Define GOLD_STANDARD_VCF_GZ to use an alternate Gold Standard."
rm -fr HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz.*
wget ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.1/GRCh37/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz
mv HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf_phased.vcf.gz GIAB-NA12878-confident.vcf.gz
# add "chr prefix:"
gzip -c -d GIAB-NA12878-confident.vcf.gz|awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' >GIAB-NA12878-confident-chr.vcf
bgzip -f GIAB-NA12878-confident-chr.vcf
tabix -f GIAB-NA12878-confident-chr.vcf.gz
echo 'export GOLD_STANDARD_VCF_GZ="GIAB-NA12878-confident-chr.vcf.gz"' >>configure.sh
export GOLD_STANDARD_VCF_GZ="GIAB-NA12878-confident-chr.vcf.gz"
else
echo "Formatting GOLD_STANDARD_VCF_GZ VCF for SNPs and indels"
fi
# remove non-SNPs:
gzip -c -d ${GOLD_STANDARD_VCF_GZ} |awk '{if($0 !~ /^#/) { if (length($4)==1 && length($5)==1) print $0;} else {print $0}}' >GOLD-confident-chr-snps.vcf
bgzip -f GOLD-confident-chr-snps.vcf
tabix -f GOLD-confident-chr-snps.vcf.gz
export GOLD_STANDARD_VCF_SNP_GZ="GOLD-confident-chr-snps.vcf.gz"
# keep only indels:
gzip -c -d ${GOLD_STANDARD_VCF_GZ} |awk '{if($0 !~ /^#/) { if (length($4)!=1 || length($5)!=1) print $0;} else {print $0}}' >GOLD-confident-chr-indels.vcf
bgzip -f GOLD-confident-chr-indels.vcf
tabix -f GOLD-confident-chr-indels.vcf.gz
export GOLD_STANDARD_VCF_INDEL_GZ="GOLD-confident-chr-indels.vcf.gz"
echo 'export GOLD_STANDARD_VCF_SNP_GZ="GOLD-confident-chr-snps.vcf.gz"' >>configure.sh
echo 'export GOLD_STANDARD_VCF_INDEL_GZ="GOLD-confident-chr-indels.vcf.gz"' >>configure.sh
echo "Gold standard VCF downloaded for NA12878 (SNPs) and named in configure.sh. Edit GOLD_STANDARD_VCF_SNP_GZ/GOLD_STANDARD_VCF_INDEL_GZ to switch to a different gold-standard validation VCF."
fi
if [ -z "${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ+set}" ]; then
echo "Downloading Gold standard Genome in a Bottle Confident Regions (bed)"
rm -fr HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed*
wget ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv3.3.1/GRCh37/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed
mv HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.1_highconf.bed GIAB-NA12878-confident-regions.bed
# add "chr prefix:"
cat GIAB-NA12878-confident-regions.bed |awk '{print "chr"$1"\t"$2"\t"$3}' >GIAB-NA12878-confident-regions-chr.bed
bgzip -f GIAB-NA12878-confident-regions-chr.bed
tabix -f GIAB-NA12878-confident-regions-chr.bed.gz
rm GIAB-NA12878-confident-regions.bed
GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ="GIAB-NA12878-confident-regions-chr.bed.gz"
echo 'export GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ="GIAB-NA12878-confident-regions-chr.bed.gz"' >>configure.sh
echo "Gold standard confident regions downloaded for NA12878 and named in configure.sh. Edit GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ to switch to a different gold-standard confident region bed file."
fi
if [ -z "${RTG_TEMPLATE+set}" ]; then
RTG_TEMPLATE="hg19.sdf"
echo "RTG_TEMPLATE not set, using default=${RTG_TEMPLATE}"
fi
if [ ! -e "${RTG_TEMPLATE}" ]; then
echo "You must install an rtg template, or build one (rtg format file.fa -o template.sdf) in the current directory. See rtg downloads at http://www.realtimegenomics.com/news/pre-formatted-reference-datasets/"
exit 1;
fi
if [ -z "${OUTPUT_PREFIX+set}" ]; then
echo "OUTPUT_PREFIX not defined. Running GenerateVCF first..."
arg_string="goby-python-with-path.sh dl/GenerateVCF.py --model "$1" --testing "$2" --prefix "$3""
if [ -n "${GOBY_VERSION}" ]; then
arg_string=""${arg_string}" --version "${GOBY_VERSION}""
fi
if [ -n "${OUTPUT_ORIGINAL}" ]; then
arg_string=""${arg_string}" --generate-original-vcf"
fi
if [ -n "${OUTPUT_ERROR}" ]; then
arg_string=""${arg_string}" --generate-error-vcf"
fi
eval ${arg_string}
dieIfError "Failed to generate VCF with arguments $*"
echo "Evaluation with rtg vcfeval starting.."
OUTPUT_PREFIX="$3"
fi
curr_date="`date +%m`_`date +%d`"
RTG_OUTPUT_FOLDER="output/${curr_date}"
mkdir -p ${RTG_OUTPUT_FOLDER}
RTG_OUTPUT_FOLDER="${RTG_OUTPUT_FOLDER}/$((`ls -l ${RTG_OUTPUT_FOLDER} | grep -c ^d` + 1))"
mkdir -p ${RTG_OUTPUT_FOLDER}
VCF_OUTPUT=${OUTPUT_PREFIX}.vcf
BED_OUTPUT=${OUTPUT_PREFIX}.bed
cp ${VCF_OUTPUT} ${RTG_OUTPUT_FOLDER}
cp ${BED_OUTPUT} ${RTG_OUTPUT_FOLDER}
VCF_OUTPUT_SORTED=`basename ${VCF_OUTPUT} .vcf`-sorted.vcf
if [ ! -e "${VCF_OUTPUT_SORTED}.gz" ]; then
cat ${VCF_OUTPUT} | vcf-sort > ${VCF_OUTPUT_SORTED}
dieIfError "Unable to sort prediction VCF."
bgzip -f ${VCF_OUTPUT_SORTED}
tabix -f -p vcf ${VCF_OUTPUT_SORTED}.gz
fi
BED_OUTPUT_SORTED=`basename ${BED_OUTPUT} .bed`-sorted.bed
if [ ! -e "${BED_OUTPUT_SORTED}.gz" ]; then
# note -V option on chromosome key below is necessary on Centos 7, with sort version 8.22,
# see https://github.com/chapmanb/bcbio-nextgen/issues/624
sort -k1,1V -k2,2n ${BED_OUTPUT} | bedtools merge > ${BED_OUTPUT_SORTED}
bgzip -f ${BED_OUTPUT_SORTED}
tabix -f -p bed ${BED_OUTPUT_SORTED}.gz
fi
cp ${VCF_OUTPUT_SORTED}.gz* ${RTG_OUTPUT_FOLDER}
cp ${BED_OUTPUT_SORTED}.gz* ${RTG_OUTPUT_FOLDER}
VCF_OUTPUT_SORTED_SNPS=`basename ${VCF_OUTPUT_SORTED} .vcf`-snps.vcf
gzip -c -d ${VCF_OUTPUT_SORTED}.gz | awk '{if($0 !~ /^#/) { if (length($4)==1 && length($5)==1) print $0;} else {print $0}}' > ${VCF_OUTPUT_SORTED_SNPS}
bgzip -f ${VCF_OUTPUT_SORTED_SNPS}
tabix -f -p vcf ${VCF_OUTPUT_SORTED_SNPS}.gz
VCF_OUTPUT_SORTED_INDELS=`basename ${VCF_OUTPUT_SORTED} .vcf`-indels.vcf
gzip -c -d ${VCF_OUTPUT_SORTED}.gz | awk '{if($0 !~ /^#/) { if (length($4)!=1 || length($5)!=1) print $0;} else {print $0}}' > ${VCF_OUTPUT_SORTED_INDELS}
bgzip -f ${VCF_OUTPUT_SORTED_INDELS}
tabix -f -p vcf ${VCF_OUTPUT_SORTED_INDELS}.gz
rtg vcfeval --baseline=${GOLD_STANDARD_VCF_SNP_GZ} \
-c ${VCF_OUTPUT_SORTED_SNPS}.gz -o ${RTG_OUTPUT_FOLDER}/snps --template=${RTG_TEMPLATE} \
--evaluation-regions=${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ} \
--bed-regions=${BED_OUTPUT_SORTED}.gz \
--vcf-score-field=P --sort-order=descending
dieIfError "Failed to run rtg vcfeval for SNPs."
rtg vcfeval --baseline=${GOLD_STANDARD_VCF_INDEL_GZ} \
-c ${VCF_OUTPUT_SORTED_INDELS}.gz -o ${RTG_OUTPUT_FOLDER}/indels --template=${RTG_TEMPLATE} \
--evaluation-regions=${GOLD_STANDARD_CONFIDENT_REGIONS_BED_GZ} \
--bed-regions=${BED_OUTPUT_SORTED}.gz \
--vcf-score-field=P --sort-order=descending
dieIfError "Failed to run rtg vcfeval."
cp ${VCF_OUTPUT_SORTED_SNPS}.gz* ${RTG_OUTPUT_FOLDER}/snps
cp ${VCF_OUTPUT_SORTED_INDELS}.gz* ${RTG_OUTPUT_FOLDER}/indels
RTG_ROCPLOT_OPTIONS="--scores"
rtg rocplot ${RTG_OUTPUT_FOLDER}/snps/snp_roc.tsv.gz --svg ${RTG_OUTPUT_FOLDER}/snps/SNP-ROC.svg ${RTG_ROCPLOT_OPTIONS} --title="SNPs, model ${1}"
dieIfError "Unable to generate SNP ROC plot."
rtg rocplot ${RTG_OUTPUT_FOLDER}/snps/snp_roc.tsv.gz -P --svg ${RTG_OUTPUT_FOLDER}/snps/SNP-PrecisionRecall.svg ${RTG_ROCPLOT_OPTIONS} --title="SNPs, model ${1}"
dieIfError "Unable to generate SNP Precision Recall plot."
rtg rocplot ${RTG_OUTPUT_FOLDER}/indels/non_snp_roc.tsv.gz -P --svg ${RTG_OUTPUT_FOLDER}/indels/INDEL-PrecisionRecall.svg ${RTG_ROCPLOT_OPTIONS} --title="INDELs, model ${1}"
dieIfError "Unable to generate indel Precision Recall plot."
rtg rocplot ${RTG_OUTPUT_FOLDER}/indels/non_snp_roc.tsv.gz --svg ${RTG_OUTPUT_FOLDER}/indels/INDEL-ROC.svg ${RTG_ROCPLOT_OPTIONS} --title="INDELs, model ${1}"
dieIfError "Unable to generate indel ROC plot."
|