File: VFDB_cdhit_to_csv.py

package info (click to toggle)
srst2 0.2.0-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,976 kB
  • sloc: python: 3,115; sh: 50; makefile: 29
file content (69 lines) | stat: -rw-r--r-- 2,041 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python3
'''
Created on 19/06/2013
@author: Harriet Dashnow, Kat Holt
Output a comma-separated table of alleles, their inferred gene name (from annotation)
and their 90% cluster from CD-HIT analysis
'''

import sys, re
from Bio import SeqIO
from argparse import ArgumentParser


def parse_args():
	parser = ArgumentParser(description='Parse cdhit results')

	parser.add_argument('--cluster_file',
						required = True,
						help = 'cd hit output file (.clstr)')
	parser.add_argument('--infile',
						required = True,
						help = 'raw sequences file that was input to cdhit (fasta)')
	parser.add_argument('--outfile',
						required = True,
						help = 'output file (csv)')
	return parser.parse_args()

def main():
	args = parse_args()
	outfile = file(args.outfile,"w")
	outfile.write("seqID,clusterid,gene,allele,DNA,annotation\n")

	database = {} # key = clusterid, value = list of seqIDs
	seq2cluster = {} # key = seqID, value = clusterid

	for line in open(args.cluster_file):
		if line.startswith(">"):
			ClusterNr = line.split()[1]
			continue

		line_split =  line.split(">")
		seqID = line_split[1].split("(")[0]

		if ClusterNr not in database:
			database[ClusterNr] = []
		if seqID not in database[ClusterNr]:
			database[ClusterNr].append(seqID) # for virulence gene DB, this is the unique ID R0xxx
		seq2cluster[seqID] = ClusterNr

	for record in SeqIO.parse(open(args.infile, "r"), "fasta"):
		full_name = record.description
		genus = full_name.split("[")[-1].split()[0]
		in_brackets = re.findall('\((.*?)\)', full_name)
		seqID = full_name.split("(")[0]
		if in_brackets[0].startswith('gi:'):
			gene = in_brackets[1]
		else:
			gene = in_brackets[0]
		gene = gene.split()[0]
		allele = gene + "_" + full_name.split(")]")[0].split("(")[-1]
		if seqID in seq2cluster:
			clusterid = seq2cluster[seqID]
			outstring = ",".join([seqID, clusterid, gene, allele, str(record.seq), re.sub(",","",record.description)]) + "\n"
			outfile.write(outstring)
	outfile.close()


if __name__ == '__main__':
	sys.exit(main())