1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
|
#!/usr/bin/python3
'''
Extract virulence genes by genus from the VFDB database at http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz
'''
import sys, re
from argparse import ArgumentParser
# BioPython modules for reading and writing sequences
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def parse_args():
parser = ArgumentParser(description='Extract virulence genes by genus from the VFDB database available at http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz')
parser.add_argument('--infile',
required = True,
help = 'Raw VFDB sequences file (fasta, e.g. download from http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz)')
parser.add_argument('--genus',
required = False,
help = 'Genus to extract (if not specified, all genera will be extracted to individual files)')
return parser.parse_args()
def main():
args = parse_args()
db = {} # key = genus, value = list of sequences
for record in SeqIO.parse(open(args.infile, "r"), "fasta"):
full_name = record.description
genus = full_name.split("[")[-1].split()[0]
if (not args.genus) or (genus == args.genus):
if genus in db:
db[genus].append(record)
else:
db[genus] = [record]
# Save all alleles from the same genus to separate fasta files
for genus in db:
records = db[genus] # list of records
SeqIO.write(records, (genus + ".fsa"), "fasta")
if __name__ == '__main__':
sys.exit(main())
|