File: get_all_vfdb.sh

package info (click to toggle)
srst2 0.2.0-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,976 kB
  • sloc: python: 3,115; sh: 50; makefile: 29
file content (46 lines) | stat: -rw-r--r-- 1,952 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
# Utility script that automates generation of all the VFDB gene databases for
# use with srst2.py.
#
# Requirements:
#   - python3 and cd-hit must be installed somewhere on the $PATH
#   - this script MUST be in the same folder as all the other
#     database_clustering python3 scripts (VFDBgenus.py,
#     VFDB_cdhit_to_csv.py, csv_to_gene_db.py)
#   - GNU readlink (its -e option is used to resolve paths)
#
# Example usage:
#   /srst2/database_clustering/get_all_vfdb.sh ./CP_VFs.ffn ./VFDB
#
# Arguments:
#   $1 - VFDB sequence file to split by genus
#   $2 - output folder (created if it does not exist)
#
# Exit status:
#   0 if every genus was processed; 1 on a usage/setup error, if no *.fsa
#   files were produced, or if any per-genus step failed.

# Print an error message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Build the clustered virulence gene database for a single genus.
# Must be called from inside the output folder, where the *.fsa files live.
# Globals:   DBCLUSTERINGSCRIPTFOLDER (read)
# Arguments: $1 - name of the genus *.fsa file in the current directory
# Outputs:   progress line on stdout; results under ./<genus>/
# Returns:   0 on success, 1 as soon as any step fails (the *.fsa file is
#            then left in place rather than moved into the genus folder)
#######################################
build_genus_db() {
  local fsa_name=${1##*/}
  local genus=${fsa_name%.*}
  local prefix="${genus}/${genus}"

  # -p so that re-running into an existing output folder is not an error
  mkdir -p -- "${genus}" || return 1

  echo "Generating virulence gene database for ${genus}"

  # Run CD-HIT to cluster the sequences for this genus, at 90% identity.
  # NOTE(review): the original comment says "nucleotide identity" but the
  # command is cd-hit rather than cd-hit-est - confirm this is intended.
  cd-hit -i "${fsa_name}" -o "${prefix}_cdhit90" -c 0.9 \
    > "${prefix}_cdhit90.stdout" || return 1

  # Parse the cluster output and tabulate the results using the
  # Virulence gene DB compatible script
  python3 "${DBCLUSTERINGSCRIPTFOLDER}/VFDB_cdhit_to_csv.py" \
    --cluster_file "${prefix}_cdhit90.clstr" \
    --infile "${fsa_name}" \
    --outfile "${prefix}_cdhit90.csv" || return 1

  # Convert the resulting csv table to a SRST2-compatible sequence database
  python3 "${DBCLUSTERINGSCRIPTFOLDER}/csv_to_gene_db.py" \
    -t "${prefix}_cdhit90.csv" \
    -o "${prefix}_VF_clustered.fasta" \
    -s 5 || return 1

  # Move the original *.fsa file into the created genus subfolder
  mv -- "${fsa_name}" "${genus}/${fsa_name}" || return 1
}

if [[ $# -ne 2 ]]; then
  echo "Usage: $(basename -- "$0") <vfdbfile> <outputfolder>" >&2
  exit 1
fi

# Resolve the input to an absolute path now: we cd into the output folder
# below, after which a relative path would no longer point at the file.
VFDBFILE=$(readlink -e -- "$1") || die "Error: VFDB file not found: $1"
[[ -f "${VFDBFILE}" ]] || die "Error: VFDB file is not a regular file: $1"
OUTPUTFOLDER=$2

# Get the srst2/database_clustering folder where all the other python3 scripts
# live side-by-side with this one. Done in two steps so that a readlink
# failure is not masked by dirname succeeding on an empty string.
SCRIPTPATH=$(readlink -e -- "$0") \
  || die "Error: cannot resolve the location of this script: $0"
DBCLUSTERINGSCRIPTFOLDER=$(dirname -- "${SCRIPTPATH}")

# Create the output folder if needed (including parents) and enter it.
# Everything below writes into the current directory, so a failed cd must
# abort instead of silently polluting the caller's working directory.
mkdir -p -- "${OUTPUTFOLDER}" \
  || die "Error: cannot create output folder: ${OUTPUTFOLDER}"
CDPATH='' cd -- "${OUTPUTFOLDER}" \
  || die "Error: cannot enter output folder: ${OUTPUTFOLDER}"

# Extract virulence genes from all available genera into separate files
python3 "${DBCLUSTERINGSCRIPTFOLDER}/VFDBgenus.py" --infile "${VFDBFILE}" \
  || die "Error: VFDBgenus.py failed for ${VFDBFILE}"

# Loop over each genus' *.fsa file and generate the gene database fasta file
FOUND=0
FAILED=0
for FSAFILE in *.fsa; do
  # Without nullglob an unmatched pattern is passed through literally
  [[ -f "${FSAFILE}" ]] || continue
  FOUND=1

  if ! build_genus_db "${FSAFILE}"; then
    printf 'Error: failed to build database for %s\n' "${FSAFILE}" >&2
    FAILED=1
  fi
done

[[ "${FOUND}" -eq 1 ]] || die "Error: no *.fsa files were produced in ${OUTPUTFOLDER}"

exit "${FAILED}"