File: Transcript.py

package info (click to toggle)
rsem 1.3.3%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 37,700 kB
  • sloc: cpp: 19,230; perl: 1,326; python: 1,245; ansic: 547; makefile: 186; sh: 154
file content (189 lines) | stat: -rw-r--r-- 6,030 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
__doc__="""

  peng 20131009

  Data structure copied from RSEM, made some name changes
"""


class Transcript:
  def __init__(self):
    self.transcript_id = None
    self.gene_id = None
    self.gene    = None
    self.transcript_group = None

    self.chrom   = None  ## RSEM Transcript's string seqname
    self.strand  = None
    self.length  = None
    self.exon_ranges = [];  ## RSEM Transcript's vector<Interval> structure
    self.gtf_attr = {}; ## RSEM Transcript's string left
    self.gtf_additional_info = None;

    self.start = None; ## genomic starting postion,
                       ## regardless of strand direction
                       ## always have a number smaller than self.end
    self.end = None;   ## genomic ending postion

    self.tss = None;   ## genomic coordinate of transcription starting site
    self.tes = None;   ## genomic coordinate of transcription ending site

    ## mappability
    self.ave_mpp_around_TSS = None  ## [TSS-flanking_width, TSS+flanking_width]
    self.ave_mpp_around_body = None ## (TSS+flanking_width, TES-flanking_width)
    self.ave_mpp_around_TES = None  ## [TES-flanking_width, TES+flanking_width]


  def __str__(self):
    s = "%s\n%s\n%s\n%s %d\n" % (self.transcript_id, self.gene_id, self.chrom,
      self.strand, self.length);
    s += "%d" % len(self.exon_ranges);
    for (start, end) in self.exon_ranges:
      s += " %d %d" % (start, end);
    s += "\n";
    for key in list(self.gtf_attr.keys()):
      for val in self.gtf_attr[key]:
        s += '%s "%s"; ' % (key, val);
    s = s.rstrip();
    return s;


  def constructFromRSEMTI(self, ti_lines):
    """
    construct Transcript from the 6 lines from RSEM .TI file
    """
    self.quicklyConstructFromRSEMTI(ti_lines);

    feature_words = ti_lines[5].rstrip(';').split(';');
    for feature_word in feature_words:
      feature_word.lstrip();
      (key, val) = feature_word.split();
      if key not in self.gtf_attr:
        self.gtf_attr[key] = [];
      self.gtf_attr[key].append(val.strip('"'));


  def quicklyConstructFromRSEMTI(self, ti_lines):
    """
    quickly construct Transcript from the 6 lines from RSEM .TI file, the last
    line won't be parsed.
    """
    self.transcript_id = ti_lines[0].split("\t")[0]
    self.gene_id = ti_lines[1].split("\t")[0]
    self.chrom = ti_lines[2];
    (self.strand, self.length) = ti_lines[3].split();
    self.length = int(self.length);
    words = ti_lines[4].split();
    for j in range(0, int(words[0])):
      start = int(words[j*2+1]);
      end = int(words[j*2+2]);
      self.exon_ranges.append( (start, end) );

    self.start = self.exon_ranges[0][0];
    self.end = self.exon_ranges[-1][-1];
    if self.strand == '+':
      self.tss = self.start
      self.tes = self.end
    elif self.strand == '-':
      self.tss = self.end
      self.tes = self.start
    self.gtf_additional_info = ti_lines[5];


  def defineTSSAndTES(self):
    """
    define TSS and TES
    """
    if (self.tss is None) or (self.tes is None):
      if self.strand == '+':
        self.tss = self.start;
        self.tes = self.end;
      elif self.strand == '-':
        self.tss = self.end;
        self.tes = self.start;


  def calculateMappability(self, bin_bigwigsummary, fbigwig, width=500,
                           quiet=True):
    """
    calculate average mappability for a transcript's
    TSS region:  [TSS-width,   TSS+width],
    body region: [start+width+1, end-width-1],
    TES region:  [TES-width,   TES+width]

    if start+width+1 > end-width-1, then define body region as
      [end-width-1, start+width+1]

    assign the values for
    self.ave_mpp_around_TSS,  self.max_mpp_around_TSS
    self.ave_mpp_around_body, self.max_mpp_around_body
    self.ave_mpp_around_TES,  self.max_mpp_around_TES
    """
    import Util

    if (self.tss is None) or (self.tes is None):
      self.defineTSSAndTES()

    self.ave_mpp_around_TSS = Util.calculateMappability('mean', self.chrom,
                              self.tss - width, self.tss + width,
                              bin_bigwigsummary, fbigwig, quiet)

    if (self.start + width + 1) < (self.end - width - 1):
      self.ave_mpp_around_body = Util.calculateMappability('mean', self.chrom,
                                 self.start+width+1, self.end-width-1,
                                 bin_bigwigsummary, fbigwig, quiet)
    elif (self.start + width + 1) > (self.end - width - 1):
      self.ave_mpp_around_body = Util.calculateMappability('mean', self.chrom,
                                 self.end-width-1, self.start+width+1,
                                 bin_bigwigsummary, fbigwig, quiet)
    elif (self.start + width + 1) == (self.end - width - 1):
      self.ave_mpp_around_body = 1.0

    self.ave_mpp_around_TES = Util.calculateMappability('mean', self.chrom,
                              self.tes - width, self.tes + width,
                              bin_bigwigsummary, fbigwig, quiet)



def readRSEMTI(fin):
  """
  read RSEM's .ti file, return a list of Transcripts objects
  """
  import Util

  lines = Util.readFile(fin);
  (ntranscripts, foo) = lines[0].split();
  ntranscripts = int(ntranscripts);
  transcripts = [];
  for i in range(0, ntranscripts):
    tr = Transcript();
    tr.constructFromRSEMTI(lines[i*6+1:i*6+7]);
    transcripts.append(tr);
    if (i > 0) and (i % 20000 == 0):
      print("processed %d transcripts" % i);

  return transcripts;


def quicklyReadRSEMTI(fin):
  """
  read RSEM's .ti file without parsing the additional information line (the last
  line in a transcript's block

  return a list of Transcripts objects
  """
  import Util

  lines = Util.readFile(fin);
  (ntranscripts, foo) = lines[0].split();
  ntranscripts = int(ntranscripts);
  transcripts = [];
  for i in range(0, ntranscripts):
    tr = Transcript();
    tr.quicklyConstructFromRSEMTI(lines[i*6+1:i*6+7]);
    transcripts.append(tr);
    if (i > 0) and (i % 20000 == 0):
      print("processed %d transcripts" % i);

  return transcripts;