File: taxonomy.py

package info (click to toggle)
microbegps 1.0.0-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 166,876 kB
  • sloc: python: 2,786; makefile: 12
file content (182 lines) | stat: -rw-r--r-- 5,197 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# -*- coding: utf-8 -*-
"""
NCBI taxonomy functions used by MicrobeGPS
"""


# -------------- Basic functions ----------------------------------------------

def parse_nodes_dmp(node_file):
	""" Parse an NCBI nodes.dmp file and extract the parental information
	(nodes) and taxonomic rank information (ranks) for each taxid.

	node_file is an iterable over the lines of nodes.dmp (fields are
	separated by '\t|\t' and the line is terminated by '\t|\n').
	Returns (nodes, ranks): nodes maps taxid -> parent taxid (both int),
	ranks maps taxid -> rank string. Malformed lines are skipped. """
	nodes = dict()
	ranks = dict()
	for line in node_file:
		fields = line.rstrip('\t|\n').split('\t|\t')
		if len(fields) < 4:
			# parenthesized single expression: valid on Python 2 and 3
			print('skipped line %s' % line)
			continue
		taxid = int(fields[0])
		parent_id = int(fields[1])
		rank = fields[2]
		nodes[taxid] = parent_id
		ranks[taxid] = rank
	return nodes, ranks


def parse_names_dmp(names_file):
	""" Parse an NCBI names.dmp file and return a dict mapping
	taxid -> name.

	names_file is an iterable over the lines of names.dmp. When a taxid
	has several names, the 'scientific name' entry wins; otherwise the
	first name seen is kept. Malformed lines are skipped. """
	taxid_to_name = dict()
	for line in names_file:
		fields = line.rstrip('\t|\n').split('\t|\t')
		if len(fields) != 4:
			# parenthesized single expression: valid on Python 2 and 3
			print('skipped line %s' % line)
			continue
		taxid = int(fields[0])
		name = fields[1]
		is_scientific = fields[3] == 'scientific name'
		# first name wins unless a scientific name shows up later
		if taxid not in taxid_to_name or is_scientific:
			taxid_to_name[taxid] = name
	return taxid_to_name

def parse_catalog(catalog_file):
	""" Parse the NCBI RefSeq catalog and return a dict mapping
	gi (string, column 4) -> taxid (int, column 1).

	catalog_file is an iterable over tab-separated catalog lines.
	Lines with fewer than 4 columns are skipped. """
	gi_to_taxid = dict()
	for line in catalog_file:
		fields = line.rstrip().split('\t')
		if len(fields) < 4:
			# malformed/short line -- skip instead of raising IndexError
			continue
		gi = fields[3]
		taxid = int(fields[0])
		gi_to_taxid[gi] = taxid
	return gi_to_taxid


def gis_to_taxid_and_name(gis, catalog):
	""" Resolve taxid and name of gi using NCBI Refseq catalog.

	gis is a collection of (string) gi. catalog is an iterable over the
	lines of the catalog file. Returns (gi_to_taxid, gi_to_name) dicts
	covering only the gis that appear in the catalog. """
	# membership is tested once per catalog line -- use a set for O(1)
	# lookup instead of scanning the gi list every time
	wanted = set(gis)
	gi_to_taxid = dict()
	gi_to_name = dict()
	for line in catalog:
		fields = line.rstrip().split('\t')
		if len(fields) < 4:
			# malformed/short line -- skip instead of raising IndexError
			continue
		gi = fields[3]
		if gi in wanted:
			gi_to_taxid[gi] = int(fields[0])
			gi_to_name[gi] = fields[1]
	return gi_to_taxid, gi_to_name
	
	
def get_lineage(taxid, nodes):
	""" Calculate the lineage of a taxid down to the root of the
	phylogenetic tree. The root is assumed to have taxid 1; nodes is a
	dict storing the parent taxid for each given taxid. Returns the list
	[taxid, parent, ..., 1] or None if the chain never reaches the root
	(unknown parent somewhere along the way). """
	chain = [taxid]
	node = taxid
	while True:
		node = nodes.get(node, None)
		chain.append(node)
		# stop at the root or when the parent is unknown/falsy
		if node == 1 or not node:
			break
	return chain if chain[-1] == 1 else None


def find_lowest_common_ancestor(taxids, nodes):
	""" Find the lowest common ancestor for given taxids using the
	hierarchy represented by the nodes dict.

	Returns the lineage of the LCA (LCA first, root taxid 1 last), or
	None if no taxid had a resolvable lineage. Taxids missing from nodes
	or without a path to the root are ignored.

	Bug fix: the previous version only shortened the running LCA when an
	explicit disagreement was found, so a lineage that is a strict suffix
	of the current LCA (i.e. one taxid is an ancestor of the other) was
	silently ignored. We now truncate to the common-suffix length. """
	lca = None
	for tid in taxids:
		if tid not in nodes:
			continue
		lineage = get_lineage(tid, nodes)
		if not lineage:
			continue
		if not lca:
			lca = lineage
		else:
			# length of the common suffix of lca and lineage
			n = min(len(lineage), len(lca))
			common = n
			for i in range(n):
				if lca[-(i+1)] != lineage[-(i+1)]:
					common = i
					break
			# both lineages end at the root, so common >= 1 here;
			# avoid lca[-0:] which would keep the whole list
			lca = lca[len(lca)-common:]
	return lca
	
	
def find_lowest_common_ancestor_name(taxids, nodes, names):
	""" Find the lowest common ancestor for given taxids using the
	hierarchy represented by 'nodes' and translate it into a name via
	'names'. Returns None when no LCA exists or when its taxid has no
	entry in 'names'. """
	ancestry = find_lowest_common_ancestor(taxids, nodes)
	return names.get(ancestry[0], None) if ancestry else None
		

def candidates_to_LCA_tree(groups, nodes, names, outdir):
	""" Write iTOL input files (tree, read counts, color strips) for the
	candidate groups into outdir.

	groups: iterable of candidate objects with a .members dict (member
	        .name is assumed to hold a taxid -- TODO confirm with caller)
	        and a .reads count.
	nodes/names: taxonomy dicts as produced by parse_nodes_dmp /
	        parse_names_dmp.
	outdir: directory receiving iTOL_tree.txt, iTOL_reads.txt and
	        iTOL_colorstrips.txt. """
	# first calculate the LCA for each candidate
	lca_list = []
	num_reads = []
	for grp in groups:
		# collect all taxids ('.values()' works on Python 2 and 3,
		# unlike the former '.itervalues()')
		taxids = [m.name for m in grp.members.values()]
		lca = find_lowest_common_ancestor(taxids, nodes)
		if lca:
			lca_list.append(lca)
			# label each candidate with the 4th taxon above the root;
			# NOTE(review): assumes every LCA lineage has >= 5 entries
			if lca[-4] == 1224: # Proteobacteria, there are many of them
				# split Proteobacteria one level deeper
				num_reads.append([str(lca[0]), names[lca[-5]], str(grp.reads)])
			else:
				num_reads.append([str(lca[0]), names[lca[-4]], str(grp.reads)])

	# build the tree as nested dicts (root at the outermost level);
	# leaf taxids are collected under the special key 'leaves'
	tree = dict()
	for lca in lca_list:
		parent = tree
		for tid in lca[::-1]:
			if tid == lca[0]:
				leaves = parent.setdefault('leaves', set())
				leaves.add(str(tid))
			else:
				parent = parent.setdefault(tid, dict())

	# convert tree to newick string; parentheses are emitted only for
	# nodes with more than one child
	def report_leaves(subtree):
		if len(subtree) == 0:
			return ''
		else:
			subtree_string = ''
			if len(subtree) > 1:
				subtree_string = '('
			for key in subtree:
				if key == 'leaves':
					subtree_string += ','.join(subtree[key])+','
				else:
					subtree_string += report_leaves(subtree[key]) + ','
			subtree_string = subtree_string[:-1]  # drop trailing comma
			if len(subtree) > 1:
				subtree_string += ')'
			return subtree_string
	newick_string = report_leaves(tree)

	# 'with' guarantees the files are closed even if a write fails
	with open(outdir+'/iTOL_tree.txt', 'w') as tree_file:
		tree_file.write(newick_string)

	with open(outdir+'/iTOL_reads.txt', 'w') as read_file:
		for line in num_reads:
			read_file.write( '%s\t%s\n'%(line[0],line[2]) )

	color_map = {'Actinobacteria':'#99CCFF','Alphaproteobacteria':'#66FF66','Bacteroidetes/Chlorobi group':'#FF9933','Betaproteobacteria':'#33CC33','Chlamydiae/Verrucomicrobia group':'#FF6699','Chloroflexi':'#FF8C8C','Cyanobacteria':'#99D6D6','delta/epsilon subdivisions':'#003300','Fibrobacteres/Acidobacteria group':'#FFFF40','Firmicutes':'#94FF70','Gammaproteobacteria':'#009933','Nitrospirae':'#9C9CBD','Planctomycetes':'#8F8FE3'}
	with open(outdir+'/iTOL_colorstrips.txt', 'w') as color_file:
		for line in num_reads:
			color_file.write( '%s\t%s\n'%(line[0],color_map.get(line[1],'#AAAAAA')) )