File: K3000_facts_to_gfa.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (191 lines) | stat: -rwxr-xr-x 8,530 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
'''
Creation of a GFA file from a set of compacted maximal facts
'''

__author__ = "Pierre Peterlongo"
__email__ = "pierre.peterlongo@inria.fr"


import sys
import K3000_common as kc


def show_right_edges (facts,x,id_x):
    ''' Main function. For a given fact x, we find y that overlap x, and we print the links in a GFA style:
    L	11	+	12	-	overlap size
    Note that one treat x only if its canonical.
    Four cases :
    1/ x overlaps y, with y canonical. One prints x + y + blabla
    2/ x overlaps y_, with y_ non canonical. In this case, y_ overlaps x. One of the two solutions has to be chosen. We chose min(idx,idy) (with idx,idy being the ids of the facts x,y in SR) One searches the id of y, and one prints x + y - blabla.
    3/ x_ overlaps y. same as 2.
    4/ x_ overlaps y_. We do nothing, this case is treated when the entry of the function is y that thus overlaps x.
    WARNING: here x and each fact in facts contain as last value its unique id.
    '''
    x=x[:-1]                                # remove the x_id from the x fact
    if not kc.is_canonical(x): return
    n=len(x)

    # CASES 1 AND 2
    strandx='+'
    # print ("x is", x)
    for len_u in range(1,n): # for each possible x suffix
        u=x[-len_u:]
        # print ("u is", u)
        Y=facts.get_lists_starting_with_given_prefix(u)
        # if x in Y: Y.remove(x)            # we remove x itself from the list of y : note that it should not occur.
        # print (x)
        # assert(x not in Y)
        if len(Y)==0: continue              # No y starting with u
        for y in Y:
            # detect the y strand
            # CASE 1/
            if kc.is_canonical(y[:-1]):     # remove the last value that corresponds to the node id
                strandy ='+'
                # id_y=indexed_nodes.index(y)                               # get the id of the target node
                id_y=kc.get_fact_id(y)                                           # last value is the node id, here the id of the target node
            # CASE 2/
            else:
                strandy='-'
#                id_y = indexed_nodes.index(kc.get_reverse_fact(y))
                id_y=kc.get_reverse_fact_id(y,facts)                                # find the reverse of list y in facts to grab its id.
                # assert kc.is_canonical(full_y[:-1]), "not canonical "+str(full_y[-1])+" full = "+str(full_y)
                # print("y is ", y)
                # print("id_y is", id_y)
                if id_x>id_y: continue # x_.y is the same as y_.x. Thus we chose one of them. By convention, we print x_.y if x<y.
            # print the edges
            print ("L\t"+str(id_x)+"\t"+strandx+"\t"+str(id_y)+"\t"+strandy+"\t"+str(len_u)+"M")


    # CASES 3 AND 4
    strandx='-'
    x_=kc.get_reverse_fact(x)
    for len_u in range(1,n): # for each possible x suffix
        u=x_[-len_u:]
        Y=facts.get_lists_starting_with_given_prefix(u)
        # assert(x_ not in Y)
        if len(Y)==0: continue  # No y starting with u
        for y in Y:
            if kc.is_canonical(y[:-1]): # CASE 3
                strandy ='+'
               # id_y=indexed_nodes.index(y)                                # get the id of the target node
                id_y=kc.get_fact_id(y)                                           # last value is the node id, here the id of the target node
                # we determine min(id_x,id_y)
                if id_x>id_y: continue # x_.y is the same as y_.x. Thus we chose one of them. By convention, we print x_.y if x<y.
                print ("L\t"+str(id_x)+"\t"+strandx+"\t"+str(id_y)+"\t"+strandy+"\t"+str(len_u)+"M") # note that strand x is always '-' and strandy is always '+' in this case.

#            else: continue # CASE 4, nothing to do.




def print_GFA_edges(facts):
    '''print each potiential edge in GFA format. Note that each edge is printed in one unique direction, the other is implicit
    WARNING: here each fact in facts contains as last value its unique id.
    '''
    for fact in facts.traverse():
        x_id = kc.get_fact_id(fact)                                         # last value is the node id
        if x_id%100==0: sys.stderr.write("\t%.2f"%(100*x_id/len(facts))+"%\r")
        show_right_edges(facts,fact,x_id)
    sys.stderr.write("\t100.00%\n")


def check_fact(fact, fact_int):
    # fact ['49648_0', '67994_-20', '20000_23']
    # fact_int 49648_0;67994_-20;20000_23; SP:0_166;126_261;178_444; BP:0_83;-20_72;23_61;
    for i,allele_id in enumerate(fact_int.split()[0].split(";")[:-1]):
        if fact[i] != allele_id:
            sys.stderr.write("Not corresponding fact and fact_int:\n")
            sys.stderr.write(str(fact)+"\n")
            sys.stderr.write(fact_int+"\n")
            sys.exit(0)
            
def index_nodeid_to_distance(facts, compacted_fact_int_file_name): 
    """ returns an index nodeid -> distances
    """
    compacted_fact_int_file=open(compacted_fact_int_file_name)
    nodeid_to_distance = {}
    for fact_line in compacted_fact_int_file.readlines():
        # 49648_0;67994_-20;20000_23; SP:0_166;126_261;178_444; BP:0_83;-20_72;23_61;
        s_fact_line = fact_line.strip().split()
        node_as_list = [node for node in s_fact_line[0].split(";")[:-1]]
        # print(s_fact_line[0], node_as_list)
        if not  kc.is_canonical(node_as_list):                       continue
        node_id = facts.get_node_id(node_as_list)
        assert node_id not in nodeid_to_distance, "node "+str(node_id)+" already in truc, with value "+ nodeid_to_distance[node_id]
        nodeid_to_distance[node_id]=s_fact_line[1]+"\t"+s_fact_line[2]
    compacted_fact_int_file.close() 
    return nodeid_to_distance
    

def contains_extreme_variant(fact, has_input_edge, has_output_edge):
    '''
    Returns True if a fact contains at least one extreme variant
    Else returns false
    '''
    for variant_id in kc.allele_values(fact):
        if variant_id not in has_input_edge or variant_id not in has_output_edge:
            return True
    return False
    
def print_GFA_nodes_as_ids(facts, compacted_fact_int_file_name, has_input_edge, has_output_edge):
    '''print canonical unitigs ids
    WARNING: here each fact in facts contains as last value its unique id.
    '''
    nodeid_to_distance = index_nodeid_to_distance(facts, compacted_fact_int_file_name)
    # compacted_fact_int_file=open(compacted_fact_int_file_name)
    for fact in facts.traverse():
        # fact_int=compacted_fact_int_file.readline().strip() # 49648_0;67994_-20;20000_23; SP:0_166;126_261;178_444; BP:0_83;-20_72;23_61;
        node_id = kc.get_fact_id(fact)                        # last value is the node id
        fact = fact[:-1]                                      # remove the last value that corresponds to the node id
        if not kc.is_canonical(fact):                       continue
        print ("S\t"+str(node_id)+"\t", end="")
        for unitig_id in fact:                       
            print (kc.unitig_id2snp_id(kc.allele_value(unitig_id))+";", end="")
        # check_fact(fact, fact_int)
        # assert str(node_id) in nodeid_to_distance, nodeid_to_distance
        print ("\t"+nodeid_to_distance[str(node_id)], end="")
        if contains_extreme_variant(fact, has_input_edge, has_output_edge):
            print("\tEV:1")
        else:
            print("\tEV:0")

def union(a, b):
    """ return the union of two lists """
    return list(set(a) | set(b))


def check(facts): 
    """ debug purpose """ 
    for fact in facts.traverse(): 
        assert kc.get_reverse_fact_id(fact,facts) != None, fact
   
    
def main():
    '''
    Creation of a GFA file from a set of facts
    For eahc fact: indicates if it contains at least an extreme variant
    Field EV
     * EV:1 if it contains at least an extreme variant
     * EV:0 else
    '''
    # SR=[[1,3,4],[14],[4,6],[-6,-1,-2],[4,6,7,9],[9,10,11,12],[4,6],[-13, -12, -11]]
    
    facts=kc.generate_facts(sys.argv[1])


    kc.add_reverse_facts(facts)
    has_input_edge, has_output_edge = kc.detect_input_output_edges(facts)
    facts.sort()
    facts.index_nodes()                          # This adds a final value to each sr, providing its node id.
    # check(facts)
    sys.stderr.write("Print GFA Nodes\n")
    print_GFA_nodes_as_ids(facts, sys.argv[1], has_input_edge, has_output_edge)
    sys.stderr.write("Print GFA Edges\n")
    print_GFA_edges(facts)


if __name__ == "__main__":
     main()