File: K3000_gfa_post_treatment.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (126 lines) | stat: -rw-r--r-- 4,371 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import warnings # deprecated

import networkx as nx

import sys
# import getopt
import K3000_common as kc
# import sorted_list
import argparse

import negative_binomial as nb

def store_graph(gfa_file_name):
    DG = nx.DiGraph()
    gfa_file = open(gfa_file_name)

    while(True):
        gfa_line=gfa_file.readline()
        if not gfa_line: break
        if gfa_line[0]=='H': continue       #Header
        if gfa_line[0]=='S':                #node
            split_gfa_line=gfa_line.strip().split()
            # print("node", split_gfa_line)
            node_id = split_gfa_line[1]
            node_coverage = split_gfa_line[-1].split(':')[-1] #  RC:i:24 -> 24
            DG.add_node(node_id, coverage=node_coverage)
            # print("added", node_id, node_coverage)
        
        if gfa_line[0]=='L':                #edge
            split_gfa_line=gfa_line.strip().split()
            # print("edge", split_gfa_line)
            source = split_gfa_line[1]
            target = split_gfa_line[3]
            overlap = split_gfa_line[5]
            if overlap == "0M" or overlap == "-2M": continue   #pairend and forbiden link are not considered
            # if overlap == "-2M": continue   #forbiden link
            DG.add_edge(source, target,type=overlap)
    gfa_file.close()
    return DG
    
    
def remove_outsider_nodes_from_cc(DG,cc):
    """ for each CC compute the r,p values of the coverage distributions of the nodes
    Remove nodes that are too far from the mean distribution
    This creates new CC
    """
    return                                  # does nothing for now, waiting for amin.
    if len(cc)<8: return                    # TODO: parameter
    coverages = []
    for node_id in cc: 
        print(DG.nodes[node_id]['coverage'])
        coverages.append(int(DG.nodes[node_id]['coverage']))
    print("for",coverages)
    n,p=nb.neg_bin_fit(coverages)
    E=n/p
    V=n*(1-p)/(p*p)
    print("n",n,"p",p,"E",E,"V",V)
    #TODO remove outsider nodes from the graph
    
def assign_cc(DG,max_cc_size=sys.maxsize):
    #CC=list(nx.connected_components(nx.Graph(DG)))
    #for cc in CC:
    #    remove_outsider_nodes_from_cc(DG,cc)
    
    # recompute CC after having removed outsiders from original CCs
    # assign each node to its cc_id
    CC=list(nx.connected_components(nx.Graph(DG)))
    cc_id=0
    for cc in CC:
        cc_id +=1
        if len(cc) > max_cc_size:
            # print ("remove nodes from too large", len(cc) )
            for node_id in cc:
                DG.remove_node(node_id)
            continue
        for node_id in cc:
            DG.nodes[node_id]['cc_id'] = cc_id


def remove_cc_with_cycles(DG):
    # remove pairend links and unitig links (unoriented)
    edges_to_remove = []
    for edge in DG.edges.data():
        if edge[2]['type'] == '-1M':
            edges_to_remove.append(edge)
            
    for edge in edges_to_remove:
        DG.remove_edge(edge[0],edge[1])
    cycles = list(nx.simple_cycles(DG))
    # sys.stderr.write(f" removed {len(cycles)} cycles\n")    #DEB
    
    # tmpnb=0                                         #DEB
    G=nx.Graph(DG)
    for nodes in cycles:
        first_node_in_cycle = nodes[0]              # get the first node, the other ones in the cycle are in the same CC
        # remove the whole CC:
        CC_with_cycle = nx.node_connected_component(G, first_node_in_cycle)
        for node_id in CC_with_cycle:
            if node_id in DG.nodes():
                # tmpnb+=1                            #DEB
                DG.remove_node(node_id)
    # sys.stderr.write(f" removed {tmpnb} nodes\n")   #DEB

def main():
    '''
    Compaction of set of facts coded as set of ids of unitigs
    '''
    warnings.warn(f"{sys.argv[0]} is not maintenained anymore (May 2020).", DeprecationWarning)
    parser = argparse.ArgumentParser(description='Compaction of set of facts coded as set of ids of unitigs.')
    parser.add_argument("input_file", type=str,
                        help="gfa file " )


    max_cc_size = 1000
    args = parser.parse_args()
    gfa_file_name = str(args.input_file)
    DG = store_graph(gfa_file_name)
    assign_cc(DG,max_cc_size)
    # print(DG.nodes.data())
    remove_cc_with_cycles(DG)
    
    if '1' in DG.nodes(): 
        print("is in DG", DG.nodes['1']['cc_id'])
    
if __name__ == "__main__":
     main()