File: create_sets.bds

package info (click to toggle)
snpeff 5.4.b%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 757,496 kB
  • sloc: java: 62,572; perl: 2,279; sh: 1,185; python: 744; xml: 507; makefile: 50
file content (72 lines) | stat: -rwxr-xr-x 1,667 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env bds

# Official gene names files
string[] versions=["GRCh37.66", "GRCh37.73"]
string genesHgnc="hgnc_complete_set.txt"

#---
# Create a mapping list of genes
#---
for( string ver : versions ) {
	print("Creating gene list file: $ver\n")
	string gtf="Homo_sapiens.$ver.gtf.gz"
	string genesGrch = "genes.$ver.list"
	task( genesGrch <- gtf ) {
		sys gunzip -c $gtf | cut -f 9 | grep "gene_name" | tr "\";" "\t\t" | cut -f 2,11 | uniq | sort | uniq > $genesGrch
	}
}
wait

string genes="genes.list"
sys join -a 1 genes.GRCh37.66.list genes.GRCh37.73.list | tr " " "\t" > $genes

string version = versions[0]
string gtf="Homo_sapiens.$version.gtf.gz"
string genesGrch = "genes.$version.list"

#---
# Create new gene set files and correct some other
#---

# Remove old versions of these sets
sys rm -f *.gmt

# Copy sets from MSigDb and other dirs
sys cp -vf MSigDB/*.gmt .
sys cp -vf ingenuity/*.gmt .

# Create 't2d' collection
sys cat mendelian.nancy/*.gmt t2d_genes.*/*.gmt | sort | uniq > t2d.gmt 

# Create 'all' collection
sys cat *.gmt | sort | uniq > all
sys mv all all.gmt

# Create 'selected' collection
sys cat  \
	c2.cp.biocarta.v4.0.symbols.gmt \
	c2.cp.kegg.v4.0.symbols.gmt \
	c2.cp.reactome.v4.0.symbols.gmt \
	c2.cp.v4.0.symbols.gmt \
	c5.all.v4.0.symbols.gmt \
	ingenuity.set.gmt \
	t2d.gmt \
	| sort \
	| uniq \
	> selected
sys mv selected selected.gmt

#---
# Correct gene names (if they are not foung in gene list)
print("Correcting gene names:\n")
for( string gmt : ".".dir(".*\.gmt") ) {
	print("\t$gmt\n")

	sys cat $gmt \
			| ./checkGeneNames.py $genesHgnc $genes \
			> tmp_$gmt \
			2> report.$gmt.out

	sys mv tmp_$gmt $gmt 
}