File: TNode.py

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (347 lines) | stat: -rwxr-xr-x 9,559 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/python3
# encoding: utf-8

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)

import os, sys, re
import logging
import argparse
import collections
import numpy
import time

import TGraph

logger = logging.getLogger(__name__)


class TNode:
    """
    generic Trinity graph node object representing a node in the Trinity isoform reconstruction graph

    Node's are objects within a gene and can be shared among transcript isoforms.

    instance members include:

        tgraph : (TGraph obj) graph for the Trinity gene, which will hold the nodes.

        transcripts: list(str) names of the isoforms that contains this node.

        loc_node_id : (int) identifier of the node

        seq : (str)  nucleotide sequence for this node in the transcript

        len : (int)  length of the node sequence

        prev : (set) node objects connected as parental nodes in the graph

        next : (set) node objects connected as descendant nodes in the graph

    class members include:

        merged_nodeset_counter : (int) tracking nodes that get merged under squeeze operations.
        
    """
    
    node_cache = dict()

    merged_nodeset_counter = 0

    all_nodes_counter = 0


    def __init__(self, tgraph, transcript_id, loc_node_id, node_seq):
        """
        constructor, but don't use directly.... instead, use TGraph.get_node() factory function
        """
        
        if len(node_seq) == 0:
            raise RuntimeError("Error, TNode instantiation requires node sequence of length > 0")

        self.tgraph = tgraph
        self.transcripts = set()
        self.add_transcripts(transcript_id)
        self.loc_node_id = loc_node_id
        self.seq = node_seq
        self.len = len(node_seq)

        TNode.all_nodes_counter += 1
        self._id = TNode.all_nodes_counter
        
        #logger.info("{}\t{}".format(loc_node_id, node_seq))

        self.prev = set()
        self.next = set()
        self.stashed_prev = set() # for manipulation during topological sorting
        self.stashed_next = set() 

        self.touched = 0  
        self.dead = False
        self.topological_order = -1  # updated on topological sorting

    #########################
    ## various Node ID values
    #########################


    def get_id(self):
        # a private unique identifier for all nodes
        return self._id
        
    def get_loc_id(self):
        return self.loc_node_id

    def set_loc_id(self, loc_node_id):
        self.loc_node_id = loc_node_id
        
    
    def get_gene_id(self):
        return self.tgraph.get_gene_id()

    def get_gene_node_id(self):

        gene_id = self.get_gene_id()
        loc_id = self.get_loc_id()

        node_id = gene_id + "::" + loc_id
                
        return node_id

    def get_touched_val(self):
        return self.touched

    def is_dead(self):
        return(self.dead)

    def is_ancestral(self, node, visited=None):

        if visited is None:
            visited = set() #init round
        
        #logger.debug("is_ancestral search from {} of node {}".format(self, node))
        if node == self:
            #logger.debug("node is self")
            return True
        
        if node in self.prev:
            #logger.debug("node in self.prev")
            return True
        else:
            #logger.debug("continuing search")
            visited.add(self)
            #logger.debug("visited: {}".format(visited))
            for prev_node in self.prev:
                #logger.debug("cascading towards prev_node: {}".format(prev_node))
                if prev_node in visited:
                    #logger.debug("prev_node in visited")
                    pass
                else:
                    #logger.debug("prev_node not in visited")
                    found = prev_node.is_ancestral(node, visited)
                    if found:
                        return True
        return False


    def is_descendant(self, node, visited=None):

        if visited == None:
            visited = set()  # init round
        
        if node == self:
            return True

        if node in self.next:
            return True
        else:
            visited.add(self)
            for next_node in self.next:
                if next_node not in visited:
                    found = next_node.is_descendant(node, visited)
                    if found:
                        return True
        return False
    
    ## Other accessors

    def get_graph(self):
        return self.tgraph

    def get_seq(self):
        return self.seq


    def set_seq(self, seq):
        self.seq = seq


    def get_topological_order(self):
        return self.topological_order

    def set_topological_order(self, topo_order):
        self.topological_order = topo_order
    
    def get_transcripts(self):
        return self.transcripts

    def add_transcripts(self, transcript_name_or_set):
        if type(transcript_name_or_set) is set:
            self.transcripts.update(transcript_name_or_set)
        elif type(transcript_name_or_set) is str:
            self.transcripts.add(transcript_name_or_set)
        else:
            raise RuntimeError("Error, parameter must be a string or a set ")
        
    def get_prev_nodes(self):
        return set(self.prev)

    def get_next_nodes(self):
        return set(self.next)
    
    def add_next_node(self, next_node_obj):
        self.next.add(next_node_obj)

    def remove_next_node(self, remove_node_obj):
        self.next.remove(remove_node_obj)

    def stash_next_node(self, stash_node_obj):
        self.remove_next_node(stash_node_obj)
        self.stashed_next.add(stash_node_obj)

    def add_prev_node(self, prev_node_obj):
        self.prev.add(prev_node_obj)

    def remove_prev_node(self, remove_node_obj):
        self.prev.remove(remove_node_obj)

    def stash_prev_node(self, stash_node_obj):
        self.remove_prev_node(stash_node_obj)
        self.stashed_prev.add(stash_node_obj)


    def restore_stashed_nodes(self):
        self.prev.update(self.stashed_prev)
        self.stashed_prev = set()

        self.next.update(self.stashed_next)
        self.stashed_next = set()

    def get_prev_node_loc_ids(self):
        loc_ids = list()
        for node in self.get_prev_nodes():
            loc_ids.append(node.get_loc_id())
        return loc_ids

    def get_next_node_loc_ids(self):
        loc_ids = list()
        for node in self.get_next_nodes():
            loc_ids.append(node.get_loc_id())
        return loc_ids
    
    def __repr__(self):
        return(self.loc_node_id)


    ## Touching nodes 
    def touch(self):
        self.touched += 1

    def untouch(self):
        self.touched -= 1

    def clear_touch(self):
        self.touched = 0


    def toString(self):
        txt = str("prev: " + str(self.get_prev_node_loc_ids()) +
                  ", me: " + str(self.get_loc_id()) +
                  ", next: " + str(self.get_next_node_loc_ids()) +
                  ", transcripts: " + str(self.transcripts) +
                  ", " + self.get_seq())

        if self.topological_order >= 0:
            txt += ", topo_order={}".format(self.topological_order)
        
        if self.dead:
            txt += " ** dead ** "
        
        return txt
    

    @classmethod
    def merge_nodes(cls, node_list):
        """
        Merges linear stretches of nodes into a single new node that has
        concatenated sequences of the input nodes
        """

        logger.debug("Merging nodes: {}".format(node_list))
        
        merged_node_seq = ""
        TNode.merged_nodeset_counter += 1
        merged_loc_node_id = "M{}".format(TNode.merged_nodeset_counter)

        # transcript list should be the intersection from nodes being merged (not the union)
        # because repeat nodes could be part of the merge.
        transcripts = node_list[0].get_transcripts()


        for node_obj in node_list:
            logger.debug("node being merge: {}".format(node_obj.toString()))
            seq = node_obj.get_seq()
            merged_node_seq += seq
            transcripts = transcripts.intersection(node_obj.get_transcripts())

        tgraph = node_list[0].get_graph()

        merged_node = TNode(tgraph, transcripts, merged_loc_node_id, merged_node_seq)
        
        return merged_node



    def is_burr(self):
        """
        returns true if node (x) is in this graphical context:

          X                               X
            \           or               /
         C-- A--?                   ?-- A--B

         where X dangles.


         So, X has only one parent or child and not otherwise connected in the graph.

         """

        if self.get_prev_nodes() and self.get_next_nodes():
            return False

        if len(self.get_prev_nodes()) > 1 or len(self.get_next_nodes()) > 1:
            return False

        # illustration above on left side
        if (len(self.get_next_nodes()) == 1
            and
            len(self.get_prev_nodes()) == 0
            and
            len(self.get_next_nodes().pop().get_prev_nodes()) > 1):

            return True

        # illustration above on right side
        if (len(self.get_next_nodes()) == 0
            and
            len(self.get_prev_nodes()) == 1
            and
            len(self.get_prev_nodes().pop().get_next_nodes()) > 1):

            return True

        # more complex structure
        return False