1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
|
#!/usr/bin/env python
import sys
# Debug mode?
debug = False
#------------------------------------------------------------------------------
# Read genes file
#------------------------------------------------------------------------------
def readGenes(genesFile):
print >> sys.stderr, "Reading file " + genesFile
genes2new = {}
genes2old = {}
id2nameNew = {}
id2nameOld = {}
for line in open(genesFile) :
fields = line.rstrip().split("\t")
if debug: print fields
geneId, nameOld = fields[0], fields[1]
nameNew = ''
if len(fields) > 2: nameNew = fields[2]
if nameNew:
genes2new[nameOld] = nameNew
id2nameNew[id] = nameNew
if nameOld:
genes2old[nameNew] = nameOld
id2nameOld[id] = nameOld
return genes2new, genes2old, id2nameNew, id2nameOld
#------------------------------------------------------------------------------
# Read HGNC file: gene names, previous names and synonyms.
#------------------------------------------------------------------------------
def readHgcn(hgncFile):
print >> sys.stderr, "Reading file " + hgncFile
genesHgcn = {}
for line in open(hgncFile) :
fields = line.rstrip().split("\t")
if len(fields) < 8: continue
geneName, prevName, synonyms = fields[1], fields[6], fields[8]
if debug: print "{}\t|{}|\t|{}|".format(geneName, prevName, synonyms)
# Add all 'previous names'
for g in prevName.split(",") :
alias = g.strip()
if alias:
if alias in genesHgcn:
print >> sys.stderr, "Error: Alias '{}' already exists ( {} vs {} )!".format( alias, genesHgcn[alias], geneName )
else :
genesHgcn[alias] = geneName
if debug: print "\tPrev: |{}|".format( alias )
# Add all 'synonyms'
for g in synonyms.split(",") :
alias = g.strip()
if alias:
if alias in genesHgcn:
print >> sys.stderr, "Error: Alias '{}' already exists ( {} vs {} )!".format( alias, genesHgcn[alias], geneName )
else :
genesHgcn[alias] = geneName
if debug: print "\tSyn: |{}|".format( alias )
return genesHgcn
#------------------------------------------------------------------------------
# Find gene
#------------------------------------------------------------------------------
#def findGeneName(g, genes2new, genes2old, genesHgcn):
def findGeneName(g):
# Gene name found, no need to find a new name
if isValid(g, genes2new): return g
# Try translating the name using 'genes2old' dictionary
geneOld = genes2old.get(g, "")
if isValid(geneOld, genes2new): return geneOld
# Try an alias
geneHgcn = genesHgcn.get(g, "")
if isValid(geneHgcn, genes2new): return geneHgcn
# We have an alias, but it was not valid.
if geneHgcn:
# Try to find an 'old' name for the alias
geneNew = genes2old.get(geneHgcn, "")
if isValid(geneNew, genes2new): return geneNew
# Desperate attempt: Find a gene that matches
for gn in genes2new:
if gn.startswith(g): return gn
for gn in genes2old:
if gn.startswith(g): return genes2old[gn]
return ""
# Valid gene name (not empty and is in 'genes' dictionary)
def isValid(gname, genes):
if gname and (gname in genes): return True
return False
#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------
#---
# Parse command line
#---
if len(sys.argv) != 3:
print >> sys.stderr, "Usage: " + sys.argv[0] + " hgnc_complete_set.txt genes.list"
sys.exit(1)
hgncFile = sys.argv[1] # This argument is a Hugo File. Note: You can download the latest version from ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc_complete_set.txt.gz
genesFile = sys.argv[2] # This is a "geneId \t geneName" list created from a GTF file
# Read files
genes2new, genes2old, id2nameNew, id2nameOld = readGenes(genesFile)
genesHgcn = readHgcn(hgncFile)
#---
# Read all lines from STDIN
# Note: This is counter intuitive because we are trying to
# replace 'new' names with 'old' names (and not the
# other way arround which is what you'd expect)
#---
for line in sys.stdin:
f = line.rstrip().split('\t')
geneSet = f[0]
genesNames = f[2:]
# Check that each gene has a valid geneID
missing = ""
missingCount = 0
foundAlias = 0
out = "{}\t{}".format(geneSet, f[1]);
for g in genesNames :
geneOld = findGeneName(g)
if not geneOld:
# No valid replacement found
missing += "\t\t'{}'\n".format(g)
missingCount += 1
elif g != geneOld:
# Replacement found
missingCount += 1
foundAlias += 1
missing += "\t\t'{}'\t->\t'{}'\n".format(g, geneOld)
# Add only if there is a gene name (skip if no replacement has been found)
if geneOld : out += "\t" + geneOld
# Show line (names have been replaced)
print out
if missingCount > 0 :
total = (len(f) - 2)
missingPerc = 100.0 * missingCount / total
print >> sys.stderr, "{}\n\tMissing : {} ( {:.1f}% )\n\tTotal : {}\n\tReplaced: {}\n\tGenes ( -> Replacement ) :\n{}".format(geneSet, missingCount, missingPerc, total, foundAlias, missing)
|