File: Compact_graph_pruner.py

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (139 lines) | stat: -rwxr-xr-x 4,127 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
# encoding: utf-8

import os, sys, re
import logging

import TGraph
import TNode
import TGLOBALS


# Module-level logger named after this module; the pruner's debug traces go through it.
logger = logging.getLogger(__name__)



class Compact_graph_pruner:
    """Prune small artifacts (burrs and simple bubbles) from a graph.

    The class keeps no state of its own: every method works on the graph
    object handed to it and only relies on the accessors it calls there
    (``get_all_nodes``/``prune_node`` on the graph; ``is_burr``, ``get_seq``,
    ``get_prev_nodes``, ``get_next_nodes``, ``get_transcripts`` and
    ``add_transcripts`` on the nodes).
    """

    def __init__(self):
        """Nothing to initialise; the pruner is stateless."""

    def remove_burrs(self, tgraph, max_burr_length):
        """Prune every burr node whose sequence is short enough.

        A node is pruned when ``node.is_burr()`` is true and
        ``len(node.get_seq()) <= max_burr_length`` (the bound is inclusive).

        Args:
            tgraph: graph providing ``get_all_nodes()`` and ``prune_node(node)``.
            max_burr_length: largest sequence length that is still pruned.
        """
        # Snapshot the nodes before pruning: if get_all_nodes() exposes the
        # graph's own container (or a view of it), pruning during iteration
        # would skip nodes or raise "changed size during iteration".
        for node in list(tgraph.get_all_nodes()):
            if node.is_burr() and len(node.get_seq()) <= max_burr_length:
                logger.debug("Compact_graph_pruner.remove_burrs - pruning burr node: {}".format(node))
                tgraph.prune_node(node)

    def pop_small_bubbles(self, tgraph, max_bubble_node_length):
        r"""Collapse each small simple bubble down to a single node.

        bubble structure:

             X1
            /  \
        ---A    B--
            \  /
             X2

        keep X1 or X2, ideally based on which is in a more highly expressed
        isoform structure (not implemented yet: the node kept is simply the
        last one in the list returned by ``_get_bubbles``).

        A node in a bubble is defined as:
        - has a single prev node (A) and a single next node (B)
        - it is both a next node of A and a prev node of B (simple bubble
          structure)

        The transcripts of every pruned sister node are added to the node
        that is kept.

        Args:
            tgraph: graph providing ``get_all_nodes()`` and ``prune_node(node)``.
            max_bubble_node_length: bubble nodes with a longer sequence are
                not considered for popping.
        """
        for bubble_node_list in self._get_bubbles(tgraph, max_bubble_node_length):
            # TODO: should sort by expression, select representative node based on highest expr
            repr_node = bubble_node_list.pop()
            for sister_bubble_node in bubble_node_list:
                repr_node.add_transcripts(sister_bubble_node.get_transcripts())
                tgraph.prune_node(sister_bubble_node)

    def _get_bubbles(self, tgraph, max_bubble_node_length):
        """Find the simple bubbles of ``tgraph``.

        Args:
            tgraph: graph providing ``get_all_nodes()``.
            max_bubble_node_length: nodes whose sequence is longer than this
                are left out of a bubble.

        Returns:
            A list of lists. Each inner list holds the (at least two) sibling
            nodes of one bubble: nodes with exactly one prev node and one
            next node, sharing both, and no longer than the length limit.
            A node appears in at most one inner list.
        """
        logger.debug("Compact_graph_pruner::_get_bubbles()")

        bubble_node_lists = list()

        # nodes already assigned to a reported bubble
        nodes_found_in_bubble = set()

        for node in tgraph.get_all_nodes():

            if node in nodes_found_in_bubble:
                # already identified as within a bubble as seeded by a different node.
                continue

            prev_nodes = node.get_prev_nodes()
            if len(prev_nodes) != 1:
                continue
            next_nodes = node.get_next_nodes()
            if len(next_nodes) != 1:
                continue

            # Peek at the only element instead of pop()-ing it: if the
            # accessors return the node's own adjacency sets, pop() would
            # silently remove edges from the graph during a read-only scan.
            child_node_A = next(iter(prev_nodes))
            parent_node_B = next(iter(next_nodes))

            bubble_nodes = child_node_A.get_next_nodes() & parent_node_B.get_prev_nodes()
            if len(bubble_nodes) < 2:
                # not a bubble
                continue

            # potential bubble...  keep only the candidates that have a single
            # prev node, a single next node, and a short enough sequence.
            selected_bubble_nodes = [
                bubble_node
                for bubble_node in bubble_nodes
                if len(bubble_node.get_prev_nodes()) == 1
                and len(bubble_node.get_next_nodes()) == 1
                and len(bubble_node.get_seq()) <= max_bubble_node_length
            ]

            if len(selected_bubble_nodes) < 2:
                # no simple bubble here
                continue

            # got a simple bubble.
            logger.debug("Found bubble: {} -- {} -- {}".format(child_node_A, selected_bubble_nodes, parent_node_B))

            if any(selected in nodes_found_in_bubble for selected in selected_bubble_nodes):
                # can happen if there's a larger bubble that contains a long sequence node as well as smaller bubble nodes.
                continue

            for selected_bubble_node in selected_bubble_nodes:
                nodes_found_in_bubble.add(selected_bubble_node)
                logger.debug("Compact_graph_pruner::_get_bubbles(): node {} identifed within a bubble".format(selected_bubble_node))

            bubble_node_lists.append(selected_bubble_nodes)

        return bubble_node_lists