1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
__doc__="""
pliu 20131002
module for gene
"""
class Gene:
def __init__(self):
self.gene_id = None;
#self.rsem_result = None;
self.chrom = None;
self.strand = None;
self.tss = None; ## transcription starting sites, strand ori considered
self.tes = None; ## transcription ending sites, strand ori considered
self.start = None; ## genomic starting position regardless of strand
## direction, always have a number smaller than self.end
self.end = None; ## genomic ending position
self.transcripts = [];
self.gtfs = [];
self.transcript_tss_groups = []; ## a list of list of transcripts having TSS
## within user-specified distance
self.transcript_groups = [] ## a list of TranscriptionGroup objects
def __str__(self):
s = "%s" % self.gene_id;
return s;
## should be moved to TranscriptGroup file
#def groupTranscriptsByTSS(self):
# """
# put transcripts that have TSS within certain distance into a group
# """
# #import Param;
# #cutoff = 100; ## TSS within 100 bp
# #cutoff = Param.TSS_GROUP_CUTOFF; ## cutoff for grouping TSS
# cutoff = 500 ## cutoff for grouping TSS
# group = [self.transcripts[0]]
# self.transcript_tss_groups.append(group)
# for tr in self.transcripts[1:]:
# is_assigned = False
# for grp in self.transcript_tss_groups:
# if (self.strand == '+') and (abs(tr.start - grp[0].start)<=cutoff):
# grp.append(tr)
# is_assigned = True;
# elif (self.strand == '-') and (abs(tr.end - grp[0].end)<=cutoff):
# grp.append(tr)
# is_assigned = True
# if is_assigned:
# break
# if not is_assigned:
# self.transcript_tss_groups.append([tr])
## should be moved to TranscriptGroup file
#def constructTranscriptGroups(self):
# """
# construct a list of TranscriptGroup objects
# """
# import TranscriptGroup
# if len(self.transcript_tss_groups) == 0:
# self.groupTranscriptsByTSS()
# for transcripts in self.transcript_tss_groups:
# grp = TranscriptGroup.TranscriptGroup()
# grp.chrom = self.chrom
# grp.gene_id = self.gene_id
# grp.strand = self.strand
# grp.transcripts = transcripts
# self.transcript_groups.append(grp)
def getStartEndTSSTESFromTranscripts(self):
"""
define start and end from gene's transcripts
start = min{all starts for transcripts};
end = max{all ends for transcripts};
"""
starts = [tr.start for tr in self.transcripts];
ends = [tr.end for tr in self.transcripts];
self.start = min(starts);
self.end = max(ends);
if self.strand == '+':
self.tss = self.start;
self.tes = self.end;
elif self.strand == '-':
self.tss = self.end;
self.tes = self.start;
def definePeakTypeByTranscriptGroups(self):
"""
all: all its transcript groups have peaks
none: none of its transcript groups has peak
mixed: some of its transcript groups have peaks, the others do not
"""
has_tss_peaks = [grp.has_peak_around_TSS for grp in self.transcript_groups]
if all(has_tss_peaks): ## all groups have peaks
self.peak_type = 'all'
else:
if any(has_tss_peaks): ## some groups have peaks, the others not
self.peak_type = 'mixed'
else: ## no group has peak
self.peak_type = 'no'
def constructGenesFromTranscripts(transcripts):
"""
return a list of genes constructed from input transcripts
"""
genes = []
gene_dict_id = {}
for tr in transcripts:
if tr.gene_id in gene_dict_id:
gene_dict_id[tr.gene_id].transcripts.append(tr)
tr.gene = gene_dict_id[tr.gene_id]
else:
gene = Gene()
gene.gene_id = tr.gene_id
gene.chrom = tr.chrom
gene.strand = tr.strand
gene.transcripts.append(tr)
genes.append(gene)
gene_dict_id[tr.gene_id] = gene
tr.gene = gene
list(map(lambda gene: gene.getStartEndTSSTESFromTranscripts(), genes));
return genes;
|