File: sqt-go-enrichment

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (326 lines) | stat: -rwxr-xr-x 11,186 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/usr/bin/env python
""" Find enrichments of GO terms for a ranked list of genes. """

from __future__ import print_function, division

import sys
import re
import csv
import textwrap
import argparse
from itertools import product, islice
from collections import defaultdict
from functools import lru_cache
import numpy as np
import math
from scipy.misc import comb as spcomb

def argument_parser():
	parser = argparse.ArgumentParser(description=__doc__)
	parser.add_argument("obo", help="The gene ontology in .obo format.")
	parser.add_argument("genelist", nargs="?", default=sys.stdin, help="A ranked list of genes. Each row should contain a comma separated list of GO terms the gene is contained in.")
	parser.add_argument("--draw-graph", "-g", help="Draw the graph of enriched GO terms.")
	parser.add_argument("--max-partition", "-m", type=int, metavar="M", help="Consider only the first M partitions when calculating an enrichment of a GO term against the ranked genelist.")
	parser.add_argument("--fixed-partition", "--forground", type=int, metavar="M", help="Consider the first M M genes as foreground and the rest as background when calculating an enrichment of a GO term against the ranked genelist.")
	parser.add_argument("--max-pvalue", "-p", type=float, metavar="P", default=0.001, help="Maximum uncorrected p-value to report an enrichment.")
	parser.add_argument("--usefdr", action="store_true", help="Use FDR for p-value cutoff (see above).")
	parser.add_argument("--fast", "-f", action="store_true", help="Calculate an upper bound rather than the exact p-value.")
	return parser

def comb(n, k, spcomb=spcomb):
	return spcomb(n, k, exact=True)

############## Enrichment ################
# based on Eden et al. "GOrilla: a tool for discovery and visualization of enriched GO terms in ranked gene lists", BMC Bioinformatics 2009


def hypergeometric_tail(N, B, n, b, comb=comb):
	"""
	Computes HGT(b;N,B,n)

	N -- number of overall genes
	B -- size of the GO term
	n -- number of tested genes
	b -- number of tested genes associated with the GO term
	"""
	NB = comb(N,B)
	hgt = sum(comb(n, i) * comb(N - n, B - i) for i in range(b, min(n, B) + 1)) / NB
	return hgt

def tails(N, B, interm, max_partition, hypergeometric_tail=hypergeometric_tail):
	b = 0
	yield hypergeometric_tail(N, B, 1, interm[0]), 1, interm[0] # yield the first element
	for n in range(2, min(len(interm), max_partition) + 1):
		isin = interm[n-1]
		if isin: # only need to compute the tail if b increases, else hgt will become only bigger and we need the minimum
			b += 1
			yield hypergeometric_tail(N, B, n, b), n, b

def min_hypergeometric_tail(N, B, interm, max_partition, tails=tails):
	"""
	Computes mHGT(lambda)

	N -- number of overall genes
	B -- size of the GO term
	interm -- the vector lambda stating (with a boolean value True) for gene i in the list whether it is contained in the considered GO term
	"""
	return min(tails(N, B, interm, max_partition), key=lambda item: item[0])

def not_visiting_paths(N, B, R):
	"""
	Computes the number of paths not visiting R (PI_R(N,B)) by dynamic programming.
	"""
	# we shift b by 1 to allow for b=-1 at the index of 0, i.e. the real b equals b - 1
	#P = np.zeros((N+1, B+2), dtype=np.uint)
	P = [[0 for b in range(B+2)] for n in range(N+1)]
	P[0][1] = 1
	for n in range(1, N + 1):
		for b in range(max(1, B + 1 - N + n), min(B + 1, n + 1) + 1):
			if (n,b-1) in R:
				P[n][b] = 0
			else:
				P[n][b] = P[n-1][b] + P[n-1][b-1]
	ret = P[N][B+1] # i.e. PI[N,B] in the paper
	return ret

def R(N, B, mhgt, comb=comb):
	"""
	Points n,b in the Grid N,B where HGT(b;N,B,n) <= mhgt
	"""
	validpoints = set()
	NB = comb(N,B)
	for n in range(1, N+1):
		hgt = 0
		for b in reversed(range(max(0, B - N + n), min(B, n) + 1)):
			hgt += comb(n, b) * comb(N - n, B - b) / NB # calc the next element of the hypergeometric tail sum
			if hgt <= mhgt:
				validpoints.add((n,b))
			else:
				break
	return validpoints

def pvalue(N, B, mhgt, not_visiting_paths=not_visiting_paths, comb=comb):
	"""
	Calculate the p-value for the given minimum Hypergeometric Tail score mhgt.
	Then the pvalue is the probability to see a score <= mhgt given N genes in total
	and B genes in the GO term.
	"""
	nvp = not_visiting_paths(N, B, R(N, B, mhgt))
	NB = comb(N,B)
	return (NB - nvp) / NB

############### GO Parser ###################

def parse_obo(obofile):
	with open(obofile) as obofile:
		for l in obofile:
			if l.startswith("[Term]"):
				goterm = GOTerm(obofile)
				#if goterm.id == 1709:
				yield goterm

class GOTerm:
	_byid = dict()

	@classmethod
	def byid(cls, id):
		if isinstance(id, str):
			return cls._byid[goid(id)]
		return cls._byid[id]

	def __init__(self, obofile):
		values = self.parse_goterm(obofile)
		self.id = goid(values["id"][0])
		self.name = values["name"][0]
		self.namespace = values["namespace"][0]
		self.definition = values["def"][0]
		self.subset = values["subset"][0] if values["subset"] else None
		self.is_a = list(map(goid, values["is_a"]))
		self._byid[self.id] = self

	@staticmethod
	def parse_goterm(obofile):
		values = defaultdict(list)
		regex = re.compile("(?P<key>\w+): (?P<value>[^!]+)")
		for l in obofile:
			if l == "\n":
				break
			match = re.match(regex, l)
			values[match.group("key")].append(match.group("value").strip())
		return values

	def __repr__(self):
		return "GO:{:07} {}".format(self.id, self.name)

def goid(idstring):
	return int(idstring[3:])

################ Genelist parser ##############

def parse_genelist(genelist):
	for l in csv.reader(genelist, delimiter="\t"):
		if len(l) > 1 and l[1]:
			yield Gene(l[0], l[1:])

class Gene:
	def __init__(self, id, goterms):
		self.id = id
		self.goterms = set(map(goid, goterms))

	def interm(self, goterm):
		return goterm.id in self.goterms

################ process data #################

def calc_fdrs(pvalues, sortedindex, n):
	"""
	Calculate FDR with the algorithm of Benjamini-Hochberg as implemented in the R package multtest.
	From Benjamini-Hochberg, 1995:
	let k be the largest i for which
	P_i <= i / n * (q*)
	then reject all H_i for i = 1,...,k. Thereby, above procedure controls the false discovery rate at q*.
	In other words, the false discovery rate FDR_i for P_i is
	P_i * n / i <= FDR_i .
	"""
	fdr = np.empty_like(pvalues)
	if n:
		fdr[sortedindex[n-1]] = pvalues[sortedindex[n-1]]
		for i in reversed(range(n-1)):
			fdr[sortedindex[i]] = min(fdr[sortedindex[i+1]], pvalues[sortedindex[i]] * (n / (i+1)), 1)
			assert fdr[sortedindex[i]] >= pvalues[sortedindex[i]]
	return fdr

def calc_interm(goterm, genelist):
	return [goterm.id in gene.goterms for gene in genelist]

def is_hit(interm, max_partition = None):
	if max_partition is None:
		max_partition = len(interm)
	return sum(interm[:max_partition]) >= 1

def calc_enrichments(goterms, genelist, max_partition = None, fixed_partition = None, max_pvalue = 0.001, fast = False):
	N = len(genelist)
	max_partition = min(len(genelist), N) if max_partition is None else max_partition
	pvalues = np.ones(len(goterms))
	interms = []
	params = []
	print("test", file=sys.stderr)
	for i, goterm in enumerate(goterms):
		interm = calc_interm(goterm, genelist)
		interms.append(interm)
		B = sum(interm)
		if is_hit(interm, max_partition=max_partition):
			if not fixed_partition is None:
				# TODO pvalue in this case has to be computed differently
				n = fixed_partition
				b = sum(interm[:fixed_partition])
				mhgt = hypergeometric_tail(N, B, n, b)
			else:
				mhgt, n, b = min_hypergeometric_tail(N, B, interm, max_partition)

			if mhgt < max_pvalue: # use lower bound of p-value as in Eden et al. Plos Comp. Biol. 2007 to omit unnecessary computations
				if fast:
					pvalues[i] = B * mhgt
				else:
					pvalues[i] = pvalue(N, B, mhgt)
				assert mhgt - pvalues[i] <= 0.0001
				assert pvalues[i] - B * mhgt <= 0.0001
		else:
			n, b = 0, 0 # no hit in possible partitions
		params.append((N, B, n, b))
		print(i, "of", len(goterms), "done", file=sys.stderr)

	return pvalues, interms, params

def significant_indices(sortedindex, hits, pvalues, max_pvalue):
	return set(i for i in islice(sortedindex, hits) if pvalues[i] <= max_pvalue)

################## drawing ###################

def collect_terms(goterms, significant):
	visited = set(significant)
	queue = list(visited)
	parents = dict()
	while queue:
		goterm = queue.pop(0)
		parents[goterm] = list(map(GOTerm.byid, goterm.is_a))
		for parent in parents[goterm]:
			if parent not in visited:
				visited.add(parent)
				queue.append(parent)
	return visited, parents

def draw_terms(outfile, goterms, pvalues, fdrs, params, significant, usefdr, maxpvalue):
	stat = pvalues if not usefdr else fdrs
	with open(outfile, "w") as dot:
		dot.write("digraph enrichment {\n")
		dot.write("node [shape=Mrecord,style=filled];")
		significant = set(goterms[i] for i in significant)
		visited, parents = collect_terms(goterms, significant)
		for i, goterm in enumerate(goterms):
			if goterm not in visited:
				continue

			if goterm in significant:
				pval = "\\npvalue: " if not usefdr else "\\nfdr: "
				pval += "{:.2e}".format(stat[i])
				saturation = 1 - stat[i] / maxpvalue
			else:
				saturation = 0
				pval = ""

			dot.write("{}[label=\"{}{}\",fillcolor=\"0.0 {} 1.0\"];\n".format(goterm.id, "\\n".join(textwrap.wrap(str(goterm), 30)), pval, saturation))
			for parent in parents[goterm]:
				dot.write("{} -> {};\n".format(parent.id, goterm.id))
		dot.write("}")




def main():
	parser = argument_parser()
	args = parser.parse_args()
	#import yappi
	#yappi.start()

	if args.genelist == sys.stdin:
		genelist = list(parse_genelist(args.genelist))
	else:
		with open(args.genelist) as f:
			genelist = list(parse_genelist(f))
	goterms = list(parse_obo(args.obo))

	pvalues, interms, params = calc_enrichments(goterms, genelist, max_partition=args.max_partition, fixed_partition=args.fixed_partition, max_pvalue = args.max_pvalue, fast = args.fast)
	hits = sum(is_hit(interm) for interm in interms)
	sortedindex = sorted(range(len(pvalues)), key=pvalues.__getitem__)
	fdrs = calc_fdrs(pvalues, sortedindex, hits)

	if args.usefdr:
		significant = significant_indices(sortedindex, hits, fdrs, args.max_pvalue)
	else:
		significant = significant_indices(sortedindex, hits, pvalues, args.max_pvalue)

	if args.draw_graph:
		draw_terms(args.draw_graph, goterms, pvalues, fdrs, params, significant, args.usefdr, args.max_pvalue)

	print("goterm\tp-value\tfdr\ttotal num of genes\tgenes in GO term\tnum of genes in partition\tnum of genes in partition and GO term\tgenes")
	for i in islice(sortedindex, hits):
		N, B, n, b = params[i]
		if i in significant:
			genes = (genelist[j].id for j, isin in islice(enumerate(interms[i]), n) if isin)
			print("{goterm}\t{pvalue}\t{fdr}\t{params}\t{genes}".format(goterm=goterms[i], pvalue=pvalues[i], fdr=fdrs[i], params="\t".join(map(str, (N,B,n,b))), genes="\t".join(genes)))
	#with open("profile.txt", "w") as out:
	#	yappi.print_stats(out=out, sort_type=2)


def test():
	N = 330
	B = 30
	mhgt = 0.0000001
	while mhgt < 1:
		assert mhgt <= pvalue(N, B, mhgt) <= B*mhgt
		print(mhgt, pvalue(N, B, mhgt), B*mhgt)
		mhgt *= 10

if __name__ == "__main__":
	main()