File: VFDBgenus.py

package info (click to toggle)
srst2 0.2.0-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,976 kB
  • sloc: python: 3,115; sh: 50; makefile: 29
file content (45 lines) | stat: -rw-r--r-- 1,470 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python3
'''
Extract virulence genes by genus from the VFDB database at http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz
'''

import sys, re
from argparse import ArgumentParser

# BioPython modules for reading and writing sequences
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def parse_args():
	parser = ArgumentParser(description='Extract virulence genes by genus from the VFDB database available at http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz')

	parser.add_argument('--infile',
						required = True,
						help = 'Raw VFDB sequences file (fasta, e.g. download from http://www.mgc.ac.cn/VFs/Down/CP_VFs.ffn.gz)')
	parser.add_argument('--genus',
						required = False,
						help = 'Genus to extract (if not specified, all genera will be extracted to individual files)')
	return parser.parse_args()  

def main():
	args = parse_args()
	
	db = {} # key = genus, value = list of sequences
	
	for record in SeqIO.parse(open(args.infile, "r"), "fasta"):
		full_name = record.description
		genus = full_name.split("[")[-1].split()[0]
		if (not args.genus) or (genus == args.genus):
			if genus in db:
				db[genus].append(record)
			else:
				db[genus] = [record]

	# Save all alleles from the same genus to separate fasta files
	for genus in db:
		records = db[genus] # list of records
		SeqIO.write(records, (genus + ".fsa"), "fasta")

if __name__ == '__main__':
	sys.exit(main())