File: cdhit_to_csv.py

package info (click to toggle)
srst2 0.2.0-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,976 kB
  • sloc: python: 3,115; sh: 50; makefile: 29
file content (159 lines) | stat: -rw-r--r-- 4,191 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
'''
Created on 19/06/2013
@author: Harriet Dashnow, Kat Holt
Output a tab-separated table of alleles, their inferred gene name (from annotation) 
and their 90% cluster using CD-HIT
Also output fasta files containing all genes sharing the same basic name (prefix before '-' in gene id)
'''

import sys
from Bio import SeqIO
from argparse import ArgumentParser


def parse_args():
	parser = ArgumentParser(description='Parse cdhit results')

	parser.add_argument('--cluster_file',
						required = True,
						help = 'cd hit output file (.clstr)')
	parser.add_argument('--infasta_file',
						required = True,
						help = 'raw sequences file (fasta)')
	parser.add_argument('--outfile',
						required = True,
						help = 'output file (csv)')
	return parser.parse_args()

def remove_trailing_numbers(string):
	index = len(string) - 1	   
	while index >= 0:
		if string[index].isdigit():
			string = string[:index]
		else:
			break
		index -= 1
	return string

def extract_allele_name(gene_allele):
	gene_name = ""
	i = 0
	for part in gene_allele.split("-"):
		
		if not part.isdigit():
			if i !=0:
				gene_name += "-"
			gene_name += part
		i += 1
	
	return remove_trailing_numbers(gene_name)	  

def main():
	args = parse_args()
	outfile = file(args.outfile,"w")
	outfile.write("seqID,clusterid,gene,allele,cluster_contains_multiple_genes,gene_found_in_multiple_clusters\n")
	 
	database = {}
	full_database = {}
	
	for line in open(args.cluster_file):
		if line.startswith(">"):
			ClusterNr = line.split()[1]
			continue
 
		line_split =  line.split()
		QueryLabel = line_split[2].strip("*.>")
		
		if ClusterNr not in full_database:
			full_database[ClusterNr] = []
		if QueryLabel not in full_database[ClusterNr]:
			full_database[ClusterNr].append(QueryLabel)
		
		cluster_label = QueryLabel.split("-")[0]
		
		if ClusterNr not in database:
			database[ClusterNr] = []
		if cluster_label not in database[ClusterNr]:
			gene_dict = {'gene':cluster_label, 'allele':QueryLabel}
			database[ClusterNr].append(gene_dict)
	
	max_cluster_size = 0
	largest_cluster = ""
	for cluster in full_database:
		cluster_size = len(full_database[cluster])
		if cluster_size > max_cluster_size:
			max_cluster_size = cluster_size
			largest_cluster = cluster
	
	all_genes = []
	multiple_genes = {}
	multiples_count = 0
	total_count = 0

	span_genes = []
	clusters_with_multiple_genes = []
		
	for cluster in database:
		total_count += 1
		cluster_genes = []
		
		for gene in database[cluster]:
			gene_name = gene['gene']
			
			if gene_name not in multiple_genes:
				multiple_genes[gene_name] = []
			multiple_genes[gene_name].append(cluster)
			
			if gene_name not in cluster_genes:
				cluster_genes.append(gene_name)
			
		if len(cluster_genes) > 1:
			multiples_count += 1
			clusters_with_multiple_genes.append(cluster)
		
	
	for gene in multiple_genes:
		unique_clusters = list(set(multiple_genes[gene]))
		if len(unique_clusters) > 1:
			span_genes.append(gene)
	
	genes_printed_to_file = []
	uniqueID = 0	
	for cluster in database:
		
		for gene in database[cluster]:
			
			if gene not in genes_printed_to_file:
				uniqueID += 1
				gene_name = gene['gene']
				allele = gene['allele']
				if cluster in clusters_with_multiple_genes:
					cluster_flag = "yes"
				else:
					cluster_flag = "no"
				if gene_name in span_genes:
					gene_flag = "yes"
				else:
					gene_flag = "no"
				
				outstring = "{0:05d},{1},{2},{3},{4},{5}\n".format(uniqueID, cluster, gene_name, allele, cluster_flag, gene_flag)
				outfile.write(outstring)
				genes_printed_to_file.append(gene)

	outfile.close()
	
	# Save all alleles with the same gene name to separate fasta files
	records_by_gene = {}
	for seq_record in SeqIO.parse(args.infasta_file, "fasta"):
		gene = seq_record.id.split("-")[0]
		if gene not in records_by_gene:
			records_by_gene[gene] = []
		records_by_gene[gene].append(seq_record)

	for gene in records_by_gene:
		SeqIO.write(records_by_gene[gene], (gene + ".fsa"), "fasta")
		


if __name__ == '__main__':
	sys.exit(main())