1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
|
#!/usr/bin/python3
# encoding: utf-8
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import os, sys, re
import logging
import argparse
import collections
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
logger = logging.getLogger(__file__)
def main():
parser = argparse.ArgumentParser(description="estimates gene length as isoform lengths weighted by TPM expression values")
parser.add_argument("--gene_trans_map", dest="gene_trans_map_file", type=str, default="",
required=True, help="gene-to-transcript mapping file, format: gene_id(tab)transcript_id")
parser.add_argument("--trans_lengths", dest="trans_lengths_file", type=str, required=True,
help="transcript length file, format: trans_id(tab)length")
parser.add_argument("--TPM_matrix", dest="TPM_matrix_file", type=str, default="",
required=True, help="isoform TPM expression matrix")
parser.add_argument("--debug", required=False, action="store_true", default=False, help="debug mode")
args = parser.parse_args()
if args.debug:
logger.setLevel(logging.DEBUG)
trans_to_gene_id_dict = parse_gene_trans_map(args.gene_trans_map_file)
trans_lengths_dict = parse_trans_lengths_file(args.trans_lengths_file)
trans_to_TPM_vals_dict = parse_TPM_matrix(args.TPM_matrix_file)
weighted_gene_lengths = compute_weighted_gene_lengths(trans_to_gene_id_dict,
trans_lengths_dict,
trans_to_TPM_vals_dict)
print("#gene_id\tlength")
for gene_id,length in weighted_gene_lengths.items():
print("\t".join([gene_id,str(length)]))
sys.exit(0)
def compute_weighted_gene_lengths(trans_to_gene_id_dict, trans_lengths_dict, trans_to_TPM_vals_dict):
gene_id_to_trans_list = collections.defaultdict(list)
gene_id_to_length = {}
pseudocount = 1
for trans_id,gene_id in trans_to_gene_id_dict.items():
gene_id_to_trans_list[gene_id].append(trans_id)
for gene_id,trans_list in gene_id_to_trans_list.items():
if len(trans_list) == 1:
gene_id_to_length[gene_id] = trans_lengths_dict[ trans_list[0] ]
else:
sum_length_x_expr = 0
sum_expr = 0
trans_expr_lengths = []
for trans_id in trans_list:
trans_len = trans_lengths_dict[trans_id]
expr_vals = trans_to_TPM_vals_dict[trans_id]
trans_sum_expr = sum(expr_vals) + pseudocount
trans_expr_lengths.append((trans_len, trans_sum_expr))
sum_length_x_expr += trans_sum_expr * trans_len
sum_expr += trans_sum_expr
weighted_gene_length = sum_length_x_expr / sum_expr
gene_id_to_length[gene_id] = int(round(weighted_gene_length))
logger.debug("Computing weighted length of {0}: {1} => {2}".format(gene_id,
trans_expr_lengths,
weighted_gene_length))
return gene_id_to_length
def parse_TPM_matrix(TPM_matrix_file):
trans_to_TPM_vals_dict = {}
with open(TPM_matrix_file) as f:
header = next(f)
for line in f:
line = line.rstrip()
vals = line.split("\t")
trans_id = vals[0]
expr_vals_list = vals[1:]
expr_vals_list = [float(x) for x in expr_vals_list]
trans_to_TPM_vals_dict[trans_id] = expr_vals_list
return trans_to_TPM_vals_dict
def parse_trans_lengths_file(trans_lengths_file):
trans_id_to_length = {}
with open(trans_lengths_file) as f:
for line in f:
line = line.rstrip()
if line[0] == '#':
continue
(trans_id, length) = line.split("\t")
if re.match("^\d+$", length):
trans_id_to_length[trans_id] = int(length)
else:
print("Warning - ignoring line: [{0}] since not parsing length value as number".format(line), file=sys.stderr)
return trans_id_to_length
def parse_gene_trans_map(gene_trans_map_file):
trans_to_gene_id = {}
with open(gene_trans_map_file) as f:
for line in f:
line = line.rstrip()
(gene_id, trans_id) = line.split("\t")
trans_to_gene_id[trans_id] = gene_id;
return trans_to_gene_id
####################
if __name__ == "__main__":
main()
|