1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
|
#!/bin/bash
#this is a utility bash script that automates generation of all the VFDB gene databases for use with srst2.py
#script assumes you already have python3, and cd-hit installed somewhere on the $PATH
#this script MUST be in the same folder as all the other database_clustering python3 scripts
#example usage:
#/srst2/database_clustering/get_all_vfdb.sh ./CP_VFs.ffn ./VFDB
#require exactly two arguments: the VFDB fasta file and the output folder
if [[ $# -ne 2 ]]; then
  #usage is a diagnostic, so send it to stderr; ${0##*/} strips the directory
  #part of the script path without forking basename and is safe with spaces
  echo "Usage: ${0##*/} <vfdbfile> <outputfolder>" >&2
  exit 1
fi
#resolve the VFDB input file to an absolute path now, because the script
#changes directory below; readlink -e fails if the file does not exist
VFDBFILE=$(readlink -e -- "$1") || {
  echo "Error: VFDB file '$1' does not exist" >&2
  exit 1
}
OUTPUTFOLDER=$2
#get the srst2/database_clustering folder where all the other python3 scripts live side-by-side with this one
DBCLUSTERINGSCRIPTFOLDER=$(dirname -- "$(readlink -e -- "$0")")
#if the specified output folder doesn't exist, then create it (including any
#missing parent folders); -p is a no-op when the folder is already there
mkdir -p -- "${OUTPUTFOLDER}" || {
  echo "Error: cannot create output folder '${OUTPUTFOLDER}'" >&2
  exit 1
}
#everything after this point works relative to the output folder, so stop
#here rather than generating and moving files in the wrong directory
cd -- "${OUTPUTFOLDER}" || {
  echo "Error: cannot change into output folder '${OUTPUTFOLDER}'" >&2
  exit 1
}
#extract virulence genes from all available genera into separate files
#(the loop below expects them as *.fsa files in the current directory);
#stop here if the extraction itself fails
python3 "${DBCLUSTERINGSCRIPTFOLDER}/VFDBgenus.py" --infile "${VFDBFILE}" || {
  echo "Error: VFDBgenus.py failed for ${VFDBFILE}" >&2
  exit 1
}
#remember whether any genus failed so the final status can report it
FAILED=0
#loop over each genus' *.fsa file and generate the gene database fasta file
for FSAFILE in *.fsa; do
  #an unmatched glob is left as the literal string "*.fsa"; skip it instead
  #of creating a bogus "*" genus folder
  [[ -e "${FSAFILE}" ]] || continue
  FILENAME=${FSAFILE##*/}
  GENUS=${FILENAME%.*}
  #-p so that re-running into an existing output folder does not error out
  if ! mkdir -p -- "${GENUS}"; then
    echo "Warning: cannot create folder for ${GENUS}, skipping" >&2
    FAILED=1
    continue
  fi
  echo "Generating virulence gene database for ${GENUS}"
  #The three steps below form a chain, each consuming the previous output:
  # 1. Run CD-HIT to cluster the sequences for this genus, at 90% nucleotide identity
  # 2. Parse the cluster output and tabulate the results using the specific
  #    Virulence gene DB compatible script
  # 3. Convert the resulting csv table to a SRST2-compatible sequence
  #A failed step is reported and the remaining steps for this genus are not run
  if ! cd-hit -i "${FILENAME}" -o "${GENUS}/${GENUS}_cdhit90" -c 0.9 \
    > "${GENUS}/${GENUS}_cdhit90.stdout"; then
    echo "Warning: cd-hit failed for ${GENUS}" >&2
    FAILED=1
  elif ! python3 "${DBCLUSTERINGSCRIPTFOLDER}/VFDB_cdhit_to_csv.py" \
    --cluster_file "${GENUS}/${GENUS}_cdhit90.clstr" \
    --infile "${FILENAME}" \
    --outfile "${GENUS}/${GENUS}_cdhit90.csv"; then
    echo "Warning: VFDB_cdhit_to_csv.py failed for ${GENUS}" >&2
    FAILED=1
  elif ! python3 "${DBCLUSTERINGSCRIPTFOLDER}/csv_to_gene_db.py" \
    -t "${GENUS}/${GENUS}_cdhit90.csv" \
    -o "${GENUS}/${GENUS}_VF_clustered.fasta" -s 5; then
    echo "Warning: csv_to_gene_db.py failed for ${GENUS}" >&2
    FAILED=1
  fi
  #move the original *.fsa file to the created genus subfolder
  mv -- "${FILENAME}" "${GENUS}/${FILENAME}" || FAILED=1
done
#leave a non-zero status behind if any genus failed (no explicit exit, so
#any code following this section still runs)
[[ ${FAILED} -eq 0 ]]
|