File: unitig.py

package info (click to toggle)
hinge 0.5.0-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,972 kB
  • sloc: cpp: 9,480; ansic: 8,826; python: 5,023; sh: 340; makefile: 10
file content (130 lines) | stat: -rwxr-xr-x 4,107 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python3
import networkx as nx
import sys
import itertools

filename = sys.argv[1]
outfile = filename.split('.')[0] + ".edges.list"

g = nx.read_graphml(filename)
print(nx.info(g))


def get_circle(g,node,vertices_of_interest):
    cur_path = [node]
    cur_vertex = g.successors(node)[0]

    i = 0
    while cur_vertex != node:
        cur_path.append(cur_vertex)
        try:
            assert len(g.successors(cur_vertex)) == 1
        except:
            print(g.successors(cur_vertex), cur_vertex, node)
            print(cur_vertex in vertices_of_interest)
            raise
        successor = g.successors(cur_vertex)[0]
        cur_vertex = successor
        
    cur_path.append(cur_vertex)
    


    return cur_path

    
def get_unitigs(g):
    paths = []
    num_paths = 0
    node_set = set(g.nodes())

    
    vertices_of_interest = set([x for x in g if g.in_degree(x) != 1 or g.out_degree(x) != 1])
    vertices_used = set(vertices_of_interest)
    for start_vertex in vertices_of_interest:
        first_out_vertices = g.successors(start_vertex)
        print(first_out_vertices)
        for vertex in first_out_vertices:
            cur_path = [start_vertex]
            cur_vertex = vertex
            while cur_vertex not in vertices_of_interest:
                successor = g.successors(cur_vertex)[0]
                cur_path.append(cur_vertex)
                predecessor = cur_vertex
                cur_vertex = successor
                
            cur_path.append(cur_vertex)
            vertices_used = vertices_used.union(set(cur_path))
            paths.append(cur_path)

    print(len(node_set))
    print(len(vertices_used))

    while len(node_set-vertices_used) > 0:
        node = list(node_set-vertices_used)[0]
        # print list(node_set-vertices_used)
        # # print vertices_of_interest
        # # print len(node_set-vertices_used)
        # break
        path = get_circle(g, node, vertices_of_interest)
        vertices_used = vertices_used.union(set(path))
        if len(path) > 1:
            paths.append(path)
    print(len(paths)) 
    # print paths
    print("paths")
    return paths
        
        
paths = get_unitigs(g)

print(len(paths))

h = nx.DiGraph()
for i, path in enumerate(paths):
    h.add_node(i)
    h.node[i]['path'] = path

vertices_of_interest = set([x for x in g if g.in_degree(x) != 1 or g.out_degree(x) != 1])
for vertex in vertices_of_interest:
    successors = [x for x in h.nodes() if h.node[x]['path'][0] == vertex]
    predecessors = [x for x in h.nodes() if h.node[x]['path'][-1] == vertex]
    print(successors,predecessors)
    assert len(successors)==1 or len(predecessors)==1
    for succ, pred in itertools.product(successors,predecessors):
        h.add_edge(pred,succ)

    # if vertex.split('_') == '0':
    #     if len(predecessors) == 1:
    #         for succ in successors:
    #             rel_suc = h.node[succ]['path'][1]
    #             d = g.get_edge_data(vertex,rel_suc)
    #             h.edge[predecessors[0]][succ]['start_pos'] = d['read_a_end']
    #             h.edge[predecessors[0]][succ]['weight'] = d['read_a_end_raw'] - d['read_a_start']
    #     if len(successors) == 1:
    #         for pred in predecessors:
    #             rel_pred = h.node[pred]['path'][-2]
    #             d = g.get_edge_data(rel_pred,vertex)
    #             h.edge[predecessors[0]][succ][''] = d['read_a_end_raw'] - d['read_a_start']


with open(outfile, 'w') as f:
    for i,path in enumerate(paths):
        f.write('>Unitig%d\n'%(i))
        for j in range(len(path)-1):
            nodeA = path[j].lstrip("B")
            nodeB = path[j+1].lstrip("B")

            d =  g.get_edge_data(path[j],path[j+1])

            f.write('%s %s %s %s %d %d %d %d %d\n'%(nodeA.split('_')[0],nodeA.split('_')[1]  , nodeB.split('_')[0], 
                    nodeB.split('_')[1], -d['read_a_start_raw'] + d['read_a_end_raw'] - d['read_b_start_raw'] + d['read_b_end_raw'], 
                    d['read_a_start_raw'], d['read_a_end_raw'], d['read_b_start_raw'], d['read_b_end_raw']))



    f.close()