1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
|
#!/usr/bin/env python3
# encoding: utf-8
import os, sys, re
import logging
import TGraph
import TNode
import TGLOBALS
logger = logging.getLogger(__name__)
class Compact_graph_pruner:
def __init__(self):
pass
def remove_burrs(self, tgraph, max_burr_length):
nodes = tgraph.get_all_nodes()
for node in nodes:
if node.is_burr() and len(node.get_seq()) <= max_burr_length:
logger.debug("Compact_graph_pruner.remove_burrs - pruning burr node: {}".format(node))
tgraph.prune_node(node)
def pop_small_bubbles(self, tgraph, max_bubble_node_length):
"""
bubble structure:
X1
/ \
---A B--
\ /
X2
keep X1 or X2, ideally based on which is in a more highly expressed isoform structure.
A node in a bubble is defined as:
- has single parent and single child
- the parents of child(A) and the children of the parent(B) are identical. (simple bubble structure)
"""
bubble_node_lists = self._get_bubbles(tgraph, max_bubble_node_length)
for bubble_node_list in bubble_node_lists:
# TODO: should sort by expression, select representative node based on highest expr
repr_node = bubble_node_list.pop()
for sister_bubble_node in bubble_node_list:
repr_node.add_transcripts(sister_bubble_node.get_transcripts())
tgraph.prune_node(sister_bubble_node)
def _get_bubbles(self, tgraph, max_bubble_node_length):
logger.debug("Compact_graph_pruner::_get_bubbles()")
bubble_node_lists = list()
node_found_in_bubble = dict() # store T if in bubble
for node in tgraph.get_all_nodes():
if node in node_found_in_bubble:
# already identified as within a bubble as seeded by a different node.
continue
if len(node.get_prev_nodes()) != 1:
continue
if len(node.get_next_nodes()) != 1:
continue
child_node_A = node.get_prev_nodes().pop()
parent_node_B = node.get_next_nodes().pop()
bubble_nodes = child_node_A.get_next_nodes() & parent_node_B.get_prev_nodes()
if len(bubble_nodes) < 2:
# not a bubble
continue
# potential bubble... check to see that each has only one parent and child
selected_bubble_nodes = list()
for bubble_node in bubble_nodes:
if len(bubble_node.get_prev_nodes()) != 1:
continue
if len(bubble_node.get_next_nodes()) != 1:
continue
if len(bubble_node.get_seq()) > max_bubble_node_length:
continue
selected_bubble_nodes.append(bubble_node)
if len(selected_bubble_nodes) < 2:
# no simple bubble here
continue
# got a simple bubble. store it.
logger.debug("Found bubble: {} -- {} -- {}".format(child_node_A, selected_bubble_nodes, parent_node_B))
seen_bubble_node_already = False
for selected_bubble_node in selected_bubble_nodes:
if selected_bubble_node in node_found_in_bubble:
seen_bubble_node_already = True
if seen_bubble_node_already:
# can happen if there's a larger bubble that contains a long sequence node as well as smaller bubble nodes.
continue
for selected_bubble_node in selected_bubble_nodes:
node_found_in_bubble[selected_bubble_node] = True
logger.debug("Compact_graph_pruner::_get_bubbles(): node {} identifed within a bubble".format(selected_bubble_node))
bubble_node_lists.append(selected_bubble_nodes)
return bubble_node_lists
|