1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
|
from collections import defaultdict
# Base directory: every input path ("inputs/...") and the output file are
# built by prefixing this string.
d = "./"

# Index inputs/MRK_ENSEMBL.rpt (tab-separated, read without skipping a header):
#   mgiID2MRK_ENSE : column 0 -> the full row (list of fields)
#   enseID2mgiID   : column 5 -> column 0
# Judging by the variable names, column 0 is the MGI marker ID and column 5
# the Ensembl gene ID -- confirm against the report's documentation.
# When a key occurs on several rows, the last row wins.
mgiID2MRK_ENSE = {}
enseID2mgiID = {}
with open(d + "inputs/MRK_ENSEMBL.rpt") as f:
    for line in f:
        if not line.strip():
            # Blank line (e.g. at end of file): nothing to index.
            continue
        # rstrip("\n") rather than line[:-1]: a final line that has no
        # trailing newline must not lose its last character.
        items = line.rstrip("\n").split("\t")
        mgiID = items[0]
        mgiID2MRK_ENSE[mgiID] = items
        enseID = items[5]
        enseID2mgiID[enseID] = mgiID
# Index inputs/MGI_Gene_Model_Coord.rpt (tab-separated, first line is a header):
#   MGI_GMC_headers : the header fields
#   enseID2MGI_GMC  : column 10 -> the full row (list of fields)
# Column 10 is used as the Ensembl gene ID (per the variable name -- confirm
# against the report header). On duplicate keys the last row wins.
enseID2MGI_GMC = {}
with open(d + "inputs/MGI_Gene_Model_Coord.rpt") as f:
    # rstrip("\n") rather than [:-1]: only a real newline is removed, so a
    # line without one keeps its last character.
    MGI_GMC_headers = f.readline().rstrip("\n").split("\t")
    for line in f:
        if not line.strip():
            # Blank line (e.g. at end of file): nothing to index.
            continue
        items = line.rstrip("\n").split("\t")
        enseID = items[10]
        enseID2MGI_GMC[enseID] = items
# Index inputs/MRK_Sequence.rpt (tab-separated, first line is a header):
#   MRK_Seq_headers : the header fields
#   mgiID2MRK_Seq   : column 0 -> the full row (list of fields)
# Column 0 is used as the MGI ID (per the variable name -- confirm against the
# report header). On duplicate keys the last row wins.
mgiID2MRK_Seq = {}
with open(d + "inputs/MRK_Sequence.rpt") as f:
    # rstrip("\n") rather than [:-1]: only a real newline is removed, so a
    # line without one keeps its last character.
    MRK_Seq_headers = f.readline().rstrip("\n").split("\t")
    for line in f:
        if not line.strip():
            # Blank line: would otherwise be stored as a bogus row under "".
            continue
        items = line.rstrip("\n").split("\t")
        mgiID = items[0]
        mgiID2MRK_Seq[mgiID] = items
# Index inputs/mart_export.txt (tab-separated, first line is a header):
#   mart_headers : the header fields
#   enseID2mart  : column 0 -> the full row (list of fields)
# Column 0 is used as the Ensembl gene ID (per the variable name -- confirm
# against the export's header). On duplicate keys the last row wins.
enseID2mart = {}
with open(d + "inputs/mart_export.txt") as f:
    # rstrip("\n") rather than [:-1]: only a real newline is removed, so a
    # line without one keeps its last character.
    mart_headers = f.readline().rstrip("\n").split("\t")
    for line in f:
        if not line.strip():
            # Blank line: would otherwise be stored as a bogus row under "".
            continue
        items = line.rstrip("\n").split("\t")
        enseID = items[0]
        enseID2mart[enseID] = items
# Index inputs/TF_TcoF-DB.tsv (tab-separated, first line is a header):
#   TF_headers    : the header fields
#   geneSymbol2TF : column 0 -> the full row (list of fields)
# Column 0 is used as the gene symbol (per the variable name -- confirm
# against the file header). On duplicate keys the last row wins.
geneSymbol2TF = {}
with open(d + "inputs/TF_TcoF-DB.tsv") as f:
    # rstrip("\n") rather than [:-1]: only a real newline is removed, so a
    # line without one keeps its last character.
    TF_headers = f.readline().rstrip("\n").split("\t")
    for line in f:
        if not line.strip():
            # Blank line: would otherwise register "" as a known symbol.
            continue
        items = line.rstrip("\n").split("\t")
        geneSymbol = items[0]
        geneSymbol2TF[geneSymbol] = items
# Group inputs/trrust_rawdata.mouse.tsv (tab-separated, read without skipping
# a header): geneSymbol2Regulated maps each column-0 value (TFSymbol) to the
# list of column-1 values seen with it, in file order, duplicates kept.
geneSymbol2Regulated = defaultdict(list)
with open(d + "inputs/trrust_rawdata.mouse.tsv") as f:
    for line in f:
        if not line.strip():
            # Blank line (e.g. at end of file): has no column 1 to read.
            continue
        # rstrip("\n") rather than line[:-1]: a final line that has no
        # trailing newline must not lose its last character.
        items = line.rstrip("\n").split("\t")
        TFSymbol = items[0]
        geneSymbol2Regulated[TFSymbol].append(items[1])
def _parse_gtf_attributes(field):
    """Parse a GTF attribute column (``key "value"; key "value";``) into a dict.

    Each ``;``-separated entry is split at its first space only, so a quoted
    value that itself contains spaces is kept whole. Empty entries (such as
    the one after a trailing ``;``) are skipped. When a key occurs more than
    once, the last occurrence wins.
    """
    attributes = {}
    for entry in field.split(";"):
        key, _, value = entry.strip().partition(" ")
        if key:
            attributes[key] = value.strip().strip('"')
    return attributes


def _field(table, key, index):
    """Return column *index* of the row stored under *key*, or "" if *key* is absent."""
    row = table.get(key)
    return row[index] if row is not None else ""


# Write gencode.vM23.metadata.tab: one header line, then one tab-separated row
# for every "gene" feature of the GENCODE GTF, enriched from the lookup tables
# loaded earlier in this script. Columns with no source are written empty.
with open(d + "gencode.vM23.metadata.tab", "w") as fout:
    fout.write("\t".join([
        "Accession",
        "AccessionVersion",
        "Gene",
        "FullName",
        "GeneType",
        "HgncID",
        "Chromosome",
        "Strand",
        "ChromosomeStart",
        "ChromosomeEnd",
        "LocusGroup",
        "LocusType",
        "Location",
        "LocationSortable",
        "Aliases",
        "VegaID",
        "UcscID",
        "RefseqID",
        "CcdsID",
        "UniprotID",
        "PubmedID",
        "MgdID",
        "RgdID",
        "CosmicID",
        "OmimID",
        "MirBaseID",
        "IsTFi (TcoF-DB)",
        "DnaBindingDomain",
        "Regulates (TRRUST)"
    ]))
    fout.write("\n")
    with open(d + "inputs/gencode.vM23.primary_assembly.annotation.gtf") as f:
        for line in f:
            if line.startswith("#"):
                # File header / comment line.
                continue
            # rstrip("\n") rather than line[:-1]: a final line without a
            # trailing newline must not lose its last character.
            items = line.rstrip("\n").split("\t")
            if len(items) < 9 or items[2] != "gene":
                # Blank/truncated line, or a feature other than "gene".
                continue
            extra = _parse_gtf_attributes(items[8])
            # gene_id carries a ".<version>" suffix; the lookup tables are
            # keyed by the part before the first dot.
            enseID = extra["gene_id"].split(".")[0]
            geneSymbol = extra.get("gene_name", "")
            # Resolve the MGI ID for THIS gene. Without this per-row lookup,
            # `mgiID` would still hold whatever the last loader loop left in
            # it, and every row would receive the same MGI-derived values.
            mgiID = enseID2mgiID.get(enseID)
            fout.write("\t".join([
                enseID,
                extra["gene_id"],
                geneSymbol,
                _field(enseID2MGI_GMC, enseID, 3),  # full name
                extra["gene_type"],  # gene type from gencode
                "",  # HGNC id
                items[0],  # Chromosome
                items[6],  # Strand
                items[3],  # Start
                items[4],  # End
                "",  # Locus group
                _field(mgiID2MRK_ENSE, mgiID, 8),  # Locus type
                "",  # Location
                "",  # Location, sortable
                "",  # Aliases
                _field(enseID2mart, enseID, 5),  # VEGA id
                _field(enseID2mart, enseID, 4),  # UCSC id
                # NOTE(review): indexes 12 and 14 are kept from the original;
                # confirm they match the RefSeq / UniProt columns of the
                # MRK_Sequence.rpt release in use.
                _field(mgiID2MRK_Seq, mgiID, 12),  # Refseq id
                _field(enseID2mart, enseID, 6),  # CCDS id
                _field(mgiID2MRK_Seq, mgiID, 14),  # Uniprot id
                "",  # Pubmed id
                "",  # MGD id
                "",  # RGD id
                "",  # COSMIC id
                "",  # OMIM id
                "",  # MIRbase id
                "True" if geneSymbol in geneSymbol2TF else "False",  # IsTF?
                "",  # DBD
                # .get() instead of [] so the defaultdict does not grow an
                # empty entry for every gene that regulates nothing.
                ",".join(geneSymbol2Regulated.get(geneSymbol, []))  # TF regulated genes
            ]))
            fout.write("\n")
|