1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
#!/usr/bin/env bds
# Official gene names files
string[] versions=["GRCh37.66", "GRCh37.73"]
string genesHgnc="hgnc_complete_set.txt"
#---
# Create a mapping list of genes
#---
for( string ver : versions ) {
print("Creating gene list file: $ver\n")
string gtf="Homo_sapiens.$ver.gtf.gz"
string genesGrch = "genes.$ver.list"
task( genesGrch <- gtf ) {
sys gunzip -c $gtf | cut -f 9 | grep "gene_name" | tr "\";" "\t\t" | cut -f 2,11 | uniq | sort | uniq > $genesGrch
}
}
wait
string genes="genes.list"
sys join -a 1 genes.GRCh37.66.list genes.GRCh37.73.list | tr " " "\t" > $genes
string version = versions[0]
string gtf="Homo_sapiens.$version.gtf.gz"
string genesGrch = "genes.$version.list"
#---
# Create new gene set files and correct some other
#---
# Remove old versions of these sets
sys rm -f *.gmt
# Copy sets from MSigDb and other dirs
sys cp -vf MSigDB/*.gmt .
sys cp -vf ingenuity/*.gmt .
# Create 't2d' collection
sys cat mendelian.nancy/*.gmt t2d_genes.*/*.gmt | sort | uniq > t2d.gmt
# Create 'all' collection
sys cat *.gmt | sort | uniq > all
sys mv all all.gmt
# Create 'selected' collection
sys cat \
c2.cp.biocarta.v4.0.symbols.gmt \
c2.cp.kegg.v4.0.symbols.gmt \
c2.cp.reactome.v4.0.symbols.gmt \
c2.cp.v4.0.symbols.gmt \
c5.all.v4.0.symbols.gmt \
ingenuity.set.gmt \
t2d.gmt \
| sort \
| uniq \
> selected
sys mv selected selected.gmt
#---
# Correct gene names (if they are not foung in gene list)
print("Correcting gene names:\n")
for( string gmt : ".".dir(".*\.gmt") ) {
print("\t$gmt\n")
sys cat $gmt \
| ./checkGeneNames.py $genesHgnc $genes \
> tmp_$gmt \
2> report.$gmt.out
sys mv tmp_$gmt $gmt
}
|