File: K3000_find_unitig_connected_pairs_of_facts.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (374 lines) | stat: -rw-r--r-- 20,810 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
import sys
import K3000_common as kc
def update_closests(closests, source, target, distance):
    # the previously stored distance from previous to current is bigger, replace it: 
    if source in closests: 
        if distance < closests[source][1]:
            closests[source] = (target, distance)
    # the variant "source" had no successor: 
    else: # source not in closest
        closests[source] = (target, distance)


def store_closest_variant_id(gfa_file_name):
    """
    Given a gfa file, store for each variant id (eg 12) the closest next variant (eg -42). In this case: 
    closests[12]=-42 and 
    closests[42]=[-12]


    The distance is computed from the BP field
    
    S       1       -577h;-977l;1354l;      SP:0_44;11_64;32_75;    BP:0_41;-26_41;-25_41;  EV:0    FC:i:52 min:17  max:410 mean:180.0      AC:410;17;113;


    closests[-577]=-977
    closests[-977]=1354
    closests[-1354]=977
    closests[977]=577

    returns closests, that actualy contains alsoe the relative distance eg:
    closests[-577]=(-977,-26)
    closests[-977]=(1354,-25)
    closests[-1354]=(977,-25)
    closests[977]=(577,-26)
    .
    """
    closests={}                          # key = variant id, value = couple (variant id, distance (from SP))
    with open(gfa_file_name) as gfa_file:
        for line in gfa_file.readlines():
            if line[0]!='S': continue
            line = line.split()
            ids = line[2].strip(';').split(';')
            distances = line[4].strip(';').split(':')[1].split(';')
            previous = int(ids[0][:-1])                         # -577
            for i in range(1,len(ids)):
                ## previous to current
                current = int(ids[i][:-1])                      # -977 with i=1
                distance = int(distances[i].split('_')[0])      # -26 with i=1
                ## Forward order: previous to current ##
                update_closests(closests, previous, current, distance) 
                ## reverse order: -current to -previous ##
                update_closests(closests, -current, -previous, distance) 
                previous = current
    return closests
    


            





def store_fact_extreme_snp_ids(gfa_file_name):
    """ 
    Given a gfa file, store the id of each extreme SNP (without h or l)
    """
    mfile = open(gfa_file_name)
    leftmost_snp_to_fact_id={}   # each SNP id (with order and without h or l) is linked to some compacted facts id
    rightmost_snp_to_fact_id={}   # each SNP id (with order and without h or l) is linked to some compacted facts id
                        # eg: '-1015': {8, 5, 6, 7}

    for line in mfile: 
        
        #                                                    S       10      24617l;10033l;-11833l;  RC:i:55
        if line[0]!="S": continue # not a compacted fact
        line=line.strip().split()
        compactedfact_id = int(line[1])
        
        #left
        leftmost_snp_id=int(line[2].strip().split(';')[0][:-1]) # eg. 24617
        if leftmost_snp_id not in leftmost_snp_to_fact_id: leftmost_snp_to_fact_id[leftmost_snp_id] = set()
        leftmost_snp_to_fact_id[leftmost_snp_id].add(compactedfact_id)
        if -leftmost_snp_id not in rightmost_snp_to_fact_id:  rightmost_snp_to_fact_id[-leftmost_snp_id] = set()
        rightmost_snp_to_fact_id[-leftmost_snp_id].add(-compactedfact_id)
        
        #right
        rightmost_snp_id=int(line[2].strip(';').split(';')[-1][:-1])# -1 is empty
        if rightmost_snp_id not in rightmost_snp_to_fact_id: rightmost_snp_to_fact_id[rightmost_snp_id] = set()
        rightmost_snp_to_fact_id[rightmost_snp_id].add(compactedfact_id)
        if -rightmost_snp_id not in leftmost_snp_to_fact_id: leftmost_snp_to_fact_id[-rightmost_snp_id] = set()
        leftmost_snp_to_fact_id[-rightmost_snp_id].add(-compactedfact_id)
    
    mfile.close()
    return leftmost_snp_to_fact_id, rightmost_snp_to_fact_id
    

def get_uppercase_sequence(sequence):
    """given a sequence lBr with l and r in acgt and B in ACGT, return B"""
    res=""
    for l in sequence:
        if l>='A' and l<='Z':
            res+=l
    return res
 

def store_remarkable_kmers(fa_file_name, k, leftmost_snp_to_fact_id, rightmost_snp_to_fact_id, LOs = {}, LIs = {}, RIs = {}, ROs = {}):
    """
    Given a disco output, store for SNPs the remarkable (k-1)mers.
    
    Remarkable (k-1)mers are
    LO (Left Out) Leftmost (k-1)mer on the left unitig
    LI (Left In) Leftmost (k-1)mer not on the SNP (corresponds to the first k-1 upper case characters)
    RI (Right In) Rightmost (k-1)mer not on the SNP (corresponds to the last k-1 upper case characters)
    RO (Right Out) Rightmost (k-1)mer on the right unitig
    
    We store only LO and LI for SNPs that are a left most SNP of at least a fact
    We store only RO and RI for SNPs that are a right most SNP of at least a fact
    """
    
    # LOs = {} # key: (k-1)mer, value: set of fact ids
    # LIs = {} # key: (k-1)mer, value: set of fact ids
    # RIs = {} # key: (k-1)mer, value: set of fact ids
    # ROs = {} # key: (k-1)mer, value: set of fact ids
    
    mfile = open(fa_file_name)
    while True:
        line1 = mfile.readline()
        if not line1: break
        line1 = line1.strip()               #>SNP_higher_path_9995|P_1:30_A/C|high|nb_pol_1|left_unitig_length_1|right_unitig_length_93|C1_39|Q1_63|G1_0/1:494,15,574|rank_0
        line2 = mfile.readline().strip()    #tGCGCGTCTCCGGCCTGAAAAAGCTGTCCGTAAACGGTACAGATAGCAATCCCCAATCGGGAaccgtctactgtagccagcgccggaatataatccgcgactttaccctgaccaatgagcggccgcacttgccgcaagatgttttctaaaattgc
        mfile.readline().strip()            #>SNP_lower_path_9995|P_1:30_A/C|high|nb_pol_1|left_unitig_length_1|right_unitig_length_93|C1_35|Q1_63|G1_0/1:494,15,574|rank_0                                 no need to store
        mfile.readline().strip()            #tGCGCGTCTCCGGCCTGAAAAAGCTGTCCGTCAACGGTACAGATAGCAATCCCCAATCGGGAaccgtctactgtagccagcgccggaatataatccgcgactttaccctgaccaatgagcggccgcacttgccgcaagatgttttctaaaattgc    no need to store
        
        if not line1.startswith(">SNP"): continue #not a SNP
        
        snp_id = int(line1.split("|")[0].split('_')[-1])
        
        
        ### FORWARD CASES ###
        # This SNP (forward) is the starting of at least one compacted fact. Thus we store its remakable left (k-1)mers and we associate them to the corresponding facts
        if snp_id in leftmost_snp_to_fact_id:
            # stored = True
            LO = line2[:k-1].upper()            # get the first (k-1)mer 
            central_sequence = get_uppercase_sequence(line2)
            LI = central_sequence[:k-1]
            # association (k-1)mer -> facts
            if LO not in LOs: LOs[LO] = set()
            LOs[LO] = LOs[LO].union(leftmost_snp_to_fact_id[snp_id])
            if LI not in LIs: LIs[LI] = set()
            LIs[LI] = LIs[LI].union(leftmost_snp_to_fact_id[snp_id])
            # print("heyL",str(snp_id)," hoL ", leftmost_snp_to_fact_id[snp_id])
            
            
        
        # This SNP (forward) is the ending of at least one compacted fact. Thus we store its remakable right (k-1)mers and we associate them to the corresponding facts
        if snp_id in rightmost_snp_to_fact_id:
            # stored = True
            RO = line2[-k+1:].upper()           # get the last (k-1)mer
            central_sequence = get_uppercase_sequence(line2)
            RI = central_sequence[-k+1:]
            # association (k-1)mer -> facts
            if RO not in ROs: ROs[RO] = set()
            ROs[RO] = ROs[RO].union(rightmost_snp_to_fact_id[snp_id])
            if RI not in RIs: RIs[RI] = set()
            RIs[RI] = RIs[RI].union(rightmost_snp_to_fact_id[snp_id])
            # print("heyR",str(snp_id)," hoR ", rightmost_snp_to_fact_id[snp_id])
            
        ### REVERSE CASES ###
        # In this case a fact starts by the reverse of a SNP, eg, -10321. In this case it is sufficient to reverse complement the sequence of the SNP and to have exactly the same treatment as in the forward case. 
        # This SNP (reverse) is the starting of at least one compacted fact. Thus we store its remakable left (k-1)mers and we associate them to the corresponding facts
        if -snp_id in leftmost_snp_to_fact_id:
            # stored = True
            line2 = kc.get_reverse_complement(line2)
            LO = line2[:k-1].upper()            # get the first (k-1)mer 
            central_sequence = get_uppercase_sequence(line2)
            LI = central_sequence[:k-1]
            # association (k-1)mer -> facts
            if LO not in LOs: LOs[LO] = set()
            LOs[LO] = LOs[LO].union(leftmost_snp_to_fact_id[-snp_id])
            if LI not in LIs: LIs[LI] = set()
            LIs[LI] = LIs[LI].union(leftmost_snp_to_fact_id[-snp_id])
            # print("heyL",str(-snp_id)," hoL ", leftmost_snp_to_fact_id[-snp_id])
            
            
        
        # This SNP (reverse) is the ending of at least one compacted fact. Thus we store its remakable right (k-1)mers and we associate them to the corresponding facts
        if -snp_id in rightmost_snp_to_fact_id:
            # stored = True
            line2 = kc.get_reverse_complement(line2)
            RO = line2[-k+1:].upper()           # get the last (k-1)mer
            central_sequence = get_uppercase_sequence(line2)
            RI = central_sequence[-k+1:]
            # association (k-1)mer -> facts
            if RO not in ROs: ROs[RO] = set()
            ROs[RO] = ROs[RO].union(rightmost_snp_to_fact_id[-snp_id])
            if RI not in RIs: RIs[RI] = set()
            RIs[RI] = RIs[RI].union(rightmost_snp_to_fact_id[-snp_id])
       
       
    mfile.close()
    return LOs, LIs, RIs, ROs

def add_canonical(canonical_links, left_fact_id, right_fact_id):
    if abs(left_fact_id) <= abs(right_fact_id): # prints only canonical 
        if left_fact_id not in canonical_links: canonical_links[left_fact_id] = set()
        canonical_links[left_fact_id].add(right_fact_id)
    else:
        if -right_fact_id not in canonical_links: canonical_links[-right_fact_id] = set()
        canonical_links[-right_fact_id].add(-left_fact_id)


def store_sequence_link_facts(LOs, LIs, RIs, ROs, k, rightmost_snp_to_fact_id, fa_file_name, canonical_links = {}):
    """ given the association (k-1)mers -> leftfacts, we may derive the links between facts
    1. RIA == LOB and ROA == LIB           -> A+ -> B+
    2. RIA == rc(ROB) and ROA == rc(RIB)   -> A+ -> B-
    3. LOA == rc(LIB) and LIA == rc(LOB)   -> A- -> B+
    4. LOA == RIB and LIA == ROB           -> A- -> B- (eq B+ -> A+)
    
    In this function we traverse again all variants, 
     for those that are the END of a fact : 
        check if their RI and RO may lead to one of the previous link
     for those whose reverse complement is the END of a fact : 
        Reverse the sequence and then check if their RI and RO may lead to one of the previous link

    Case co-mapped:
    Finaly, two successive facts can exist if their extreme variants (eg r and l) were seen at least 
    once co-mapped and if there exists no other variant (eg m) between r, and l. 
    This information is stored in thie closests dictionary
    """
    
    mfile = open(fa_file_name)
    while True:
        line1 = mfile.readline()
        if not line1: break
        line1 = line1.strip()               #>SNP_higher_path_9995|P_1:30_A/C|high|nb_pol_1|left_unitig_length_1|right_unitig_length_93|C1_39|Q1_63|G1_0/1:494,15,574|rank_0
        line2 = mfile.readline().strip()    #tGCGCGTCTCCGGCCTGAAAAAGCTGTCCGTAAACGGTACAGATAGCAATCCCCAATCGGGAaccgtctactgtagccagcgccggaatataatccgcgactttaccctgaccaatgagcggccgcacttgccgcaagatgttttctaaaattgc
        mfile.readline().strip()            #>SNP_lower_path_9995|P_1:30_A/C|high|nb_pol_1|left_unitig_length_1|right_unitig_length_93|C1_35|Q1_63|G1_0/1:494,15,574|rank_0                                 no need to store
        mfile.readline().strip()            #tGCGCGTCTCCGGCCTGAAAAAGCTGTCCGTCAACGGTACAGATAGCAATCCCCAATCGGGAaccgtctactgtagccagcgccggaatataatccgcgactttaccctgaccaatgagcggccgcacttgccgcaagatgttttctaaaattgc    no need to store
        
        if not line1.startswith(">SNP"): continue #not a SNP
        
        snp_id = int(line1.split("|")[0].split('_')[-1])
        
        # This SNP (forward) is the ending of at least one compacted fact. Thus we store its remakable right (k-1)mers and we associate them to the corresponding facts
        if snp_id in rightmost_snp_to_fact_id:
            ROA = line2[-k+1:].upper()           # get the last (k-1)mer
            central_sequence = get_uppercase_sequence(line2)
            RIA = central_sequence[-k+1:]
            
            # CASE 1.
            if RIA in LOs and ROA in LIs:
                set_of_right_facts_compatible = LOs[RIA].intersection(LIs[ROA])                                                  # select all facts ids both whose LO == RI and LI == RO                 -> A+ -> B+
                for right_fact_id in set_of_right_facts_compatible:
                    for left_fact_id in rightmost_snp_to_fact_id[snp_id]:
                        add_canonical(canonical_links, left_fact_id, right_fact_id)
                        # print (f"L\t{abs(left_fact_id)}\t{sign(left_fact_id)}\t{abs(right_fact_id)}\t{sign(right_fact_id)}\t-1Ma")
                        # print ("L\t"+str(left_fact_id)+"\t+\t"+str(right_fact_id)+"\t+\t-1Ma")                          # -1 enables to detect those links

            # CASE 2.
            if kc.get_reverse_complement(RIA) in ROs and kc.get_reverse_complement(ROA) in RIs:
                set_of_right_facts_compatible = ROs[kc.get_reverse_complement(RIA)].intersection(RIs[kc.get_reverse_complement(ROA)])  # select all facts ids both whose RIA == rc(ROB) and ROA == rc(RIB)     -> A+ -> B-
                for right_fact_id in set_of_right_facts_compatible:
                    for left_fact_id in rightmost_snp_to_fact_id[snp_id]:
                        add_canonical(canonical_links, left_fact_id, -right_fact_id)
                        # print (f"L\t{abs(left_fact_id)}\t{sign(left_fact_id)}\t{abs(right_fact_id)}\t{revsign(right_fact_id)}\t-1Ma")
                        
                        # print ("L\t"+str(left_fact_id)+"\t+\t"+str(right_fact_id)+"\t-\t-1Mb")                          # -1 enables to detect those links
                    
            # CASE 3. & 4. -> they are symetrical:
            # Cases 1. & 2. : detects A -> B and A -> B_, the other cases are
            # CASE 3. A_ -> B  will be detected when traversing B, detecting then A_ (Case 2.)
            # CASE 4. B -> A   will be detected when traversing Bn detecting then A  (Case 1.)

    mfile.close()
    return canonical_links




def store_link_facts_from_comapped(leftmost_snp_to_fact_id, rightmost_snp_to_fact_id, closests, canonical_links):
    """
    given a set of rightmost snp to fact ids and leftmost snp to fact id, and the closests variants dictionary:
    add links between facts
    """

    # is case variants a b were found with distance d
    # and latter a' b were foun with distance d' < d, we did not deduced a a' with distance d-d'
    # hence some links are lost. 
    # To avoid non-symetrical prints we store all canonical links and then we print them

    for source, target in closests.items():
        if target[1]<0: continue
        if source in rightmost_snp_to_fact_id and target[0] in leftmost_snp_to_fact_id:
            for left_fact_id in rightmost_snp_to_fact_id[source]:
                for right_fact_id in leftmost_snp_to_fact_id[target[0]]:
                    add_canonical(canonical_links, left_fact_id, right_fact_id)
    return canonical_links


sign = lambda s: "+" if s >=0 else "-"
revsign = lambda s: "-" if s >=0 else "+"  
def print_canonical_successive_links(canonical_links):
    for source, targets in canonical_links.items():
        for target in targets: 
            print(f"L\t{abs(source)}\t{sign(source)}\t{abs(target)}\t{sign(target)}\t-1M")


def print_original_gfa(gfa_file_name):
    mfile = open(gfa_file_name)
    print ("H\t#################")
    print ("H\t# GFA of variants")
    print ("H\t#################")
    print ("H\t# Nodes are (compacted) facts with their read mapping coverage. Eg. \"S       1       -577h;-977l;1354l;      SP:0_44;11_64;32_75;    BP:0_41;-26_41;-25_41;  FC:i:52 min:17  max:410 mean:180.0      AC:17;410;113;\".")
    print ("H\t#  * field SP stands for \"Sequence Position\". A bit useless in this file. It indicates for each allele of the fact its starting and ending position in the ACGT sequence, not shown here")
    print ("H\t#  * field BP stands for \"Bubble Position\". For each allele of the fact it indicates:")
    print ("H\t#     - first: the relative position of first nucleotide of the bubble with respect to the position of the last nucleotide of the bubble of the previous allele. This value is equal to zero for the first allele")
    print ("H\t#     - second: the length of the bubble of the allele") 
    print ("H\t#  * field EV strands for \"Extreme Variant\". Facts having at least one variant with no input or no output edge are considered as facts having an extreme variants. Their value is set to EV:1. Else, the value is set to EV:0")
    print ("H\t#  * field FC is the coverage of the fact, as provided by the total number of reads that phased at least two alleles of the fact")
    print ("H\t#  * fields min, max, and mean stand resp. for the min, max and mean of the read coverage of all alleles")
    print ("H\t#  * field AC stands for \"Allele Coverage\". The number of reads that map each allele is given in the same order as the variant ids (eg. \"17;410;113;\" are respectively, the coverages of variants \"-577h;-977l;1354l\")")
    print ("H\t# Four types of edges:")
    print ("H\t#   1. Overlap between facts. These links have an overlap length >0. Eg, \"L	1	-	29384	+	3M\", with:")
    print ("H\t#       \"S	1	10011l;23229h;-21935l;-8929l;-24397l;10011h;	RC:i:24\", and")
    print ("H\t#       \"S	29384	21935l;-23229h;-10011l;24397l;-23229l;-25549h;-10011h;	RC:i:43\".")
    print ("H\t#   2. Facts linked by paired end reads.  Eg \"L	10735	+	29384	+	0M	FC:i:5\".")
    print ("H\t#       The coverage indicates the number of pairend read linking the two facts")
    print ("H\t#       These links have an overlap of length 0.")
    print ("H\t#   3. Facts linked by unitigs. The unitig finishing a fact overlaps the unitig starting another fact. Eg \"L	19946	+	11433	+	-1M\".")
    print ("H\t#       These links are directed and validate the facts orientation. ")
    print ("H\t#       These links have an overlap of length -1.")
    print ("H\t#   4. Facts sharing at least one SNP identifier.")
    print ("H\t#       These links have an overlap of length -2.")
    
    
    
    for l in mfile: 
        print(l,end='')
    mfile.close()
    



def main (gfa_file_name, co_fa_file_name, unco_fa_file_name):

    sys.stderr.write("#Print original gfa\n")
    print_original_gfa(gfa_file_name)


    sys.stderr.write("#Store extreme left and right variant id for each fact\n")
    leftmost_snp_to_fact_id, rightmost_snp_to_fact_id   = store_fact_extreme_snp_ids(gfa_file_name)
    k                                                   = kc.determine_k(co_fa_file_name)
    sys.stderr.write("#Store remarkable kmers for each such variant\n")
    LOs, LIs, RIs, ROs                                  = store_remarkable_kmers(co_fa_file_name, k,leftmost_snp_to_fact_id, rightmost_snp_to_fact_id)
    LOs, LIs, RIs, ROs                                  = store_remarkable_kmers(unco_fa_file_name, k,leftmost_snp_to_fact_id, rightmost_snp_to_fact_id, LOs, LIs, RIs, ROs)

    sys.stderr.write("#Compute successive links obtained from kmers\n")
    canonical_links                                     = store_sequence_link_facts(LOs, LIs, RIs, ROs, k, rightmost_snp_to_fact_id, co_fa_file_name)
    canonical_links                                     = store_sequence_link_facts(LOs, LIs, RIs, ROs, k, rightmost_snp_to_fact_id, unco_fa_file_name, canonical_links)
    del LOs, LIs, RIs, ROs

    sys.stderr.write("#Store distances between co-mapped variants\n")
    closests                                            = store_closest_variant_id(gfa_file_name)
   
    sys.stderr.write("#Compute successive links obtained from co-mapped variants\n")
    canonical_links                                     = store_link_facts_from_comapped(leftmost_snp_to_fact_id, rightmost_snp_to_fact_id, closests, canonical_links)

    sys.stderr.write(f"#Write successive links\n")
    print_canonical_successive_links(canonical_links)

    
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3])