File: K3000_enhance_gfa.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (368 lines) | stat: -rw-r--r-- 19,054 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
import sys

def get_left_clean_snp(snp):
    return snp.lstrip().lstrip('-')


def set_indexes_from_gfa(gfa_file_name):
    mfile = open(gfa_file_name)
    snp_to_fact_id={}   # each SNP id (without order nor h or l) is linked to some compacted facts id
                        # eg: '1015': {8, 5, 6, 7}
    facts={}            # each compacted fact is accessible by a unique id. We store only the set of SNPs ids (not their orientering or their intra distance)
                        # eg:  5: {' 5: {'1015h', '827h'}', '827h'}
    for line in mfile: 
        
        #                                                    S       0       -5001_l;-8805_h;-12869_h;-25834_l;-47306_l;38133_l;
        if line[0]!="S": continue # not a compacted fact
        line=line.strip().split()
        compactedfact_id = line[1]
        facts[compactedfact_id]=[] # stored as a set to conserve the variants order, needed for computing AC.
        for value in line[2].strip().split(';')[:-1]:
            facts[compactedfact_id].append(value)
            snp_id=get_left_clean_snp(value) # from ' -587h' to '587h'
            if snp_id[:-1] not in snp_to_fact_id:
                snp_to_fact_id[snp_id[:-1]]=set()
            snp_to_fact_id[snp_id[:-1]].add(compactedfact_id)
    mfile.close()
    return facts, snp_to_fact_id
    

query_sign = lambda s : "-" if s[0]=="-" else "+"   

def compatibles(raw_fact,compacted_fact):
    """"given a raw fact, detects if it is totaly included into the compacted_fact[i]
    we already know that the 2 facts share at least a snp
    This methods detects if there exists or not a snp, with distinct alleles in the two facts (eg 1000h in one fact, 1000l in the other). 
    Returns false in this case, True else. 
    When True is returned, a second boolean value is returned. 
    It is set to True of the raw fact and the compacted_fact are in the same orientation, else it is set to false
    """

    ## index query facts:
    query_fact_to_hl = {}
    query_fact_to_pm = {}
    for variant in raw_fact:
        id_only = variant.lstrip('-')[:-1]
        query_fact_to_pm[id_only] = query_sign(variant)
        query_fact_to_hl[id_only] = variant[-1]

    ## checks that all variants from the raw fact are included in the compacted_fact: 
    for variant in raw_fact:
        if variant not in compacted_fact and variant.lstrip('-') not in compacted_fact and '-'+variant not in compacted_fact : 
            return False, None

    ## check compacted fact vs query fact
    same_direction  = False
    direction_set   = False
    for variant in compacted_fact:
        id_only = variant.lstrip('-')[:-1]
        if id_only not in query_fact_to_hl: continue
        sign = query_sign(variant)
        if variant[-1] != query_fact_to_hl[id_only]: return False, None
        # check sign
        if not direction_set: 
            direction_set = True
            same_direction =  (sign == query_fact_to_pm[id_only])
        else:
            if same_direction and sign != query_fact_to_pm[id_only]: return False, None
            if not same_direction and sign == query_fact_to_pm[id_only]: return False, None
    assert (direction_set)
    return True, same_direction
    

def get_compatible_facts(text_raw_fact, compacted_facts, snp_to_fact_id):
    """
    given the text of a raw fact, returns all compacted_facts ids in which this raw fact occurs
    add the sign of each of the compacted_facts id in which this raw fact occurs. Negative if it is not the same orientation as the text_raw_fact, else positive
    Example:
        * text_raw_fact: -10000l_0;92837_h;
        * compacted_facts: ... '29731': {'31938l', '499h'} ...  (factid -> list of non oriented alleles)
        * snp_to_fact_id: ... '31938': {'29731'} ...            (snp id non oriented -> list of fact ids in which the snp occurs)
    """
    subcompacedfacts=set()                      # Stores all facts potentially compatible with any of the snp of the text_raw_fact
    raw_fact_snps = set()                       # Stores the oriented alleles of the text_raw_fact. 
    result = set()                              # Stores the id of the compacted facts that are compatible with the input text_raw_fact
    text_raw_fact=text_raw_fact.rstrip(';')     #Avoids an empty value when splitting with ';'
    for oriented_allele in text_raw_fact.split(";"):
        snp_id_only=get_left_clean_snp(oriented_allele).split("_")[0][:-1]      # get the snp id non oriented
        if snp_id_only in snp_to_fact_id: # the snp may be absent in case it was removed by the sequence concatenation process. 
            subcompacedfacts=subcompacedfacts.union(snp_to_fact_id[snp_id_only])    # fill the subcompacedfacts with all facts in which the snp id non oriented occurs. 
            raw_fact_snps.add(oriented_allele.split("_")[0])
    for j in subcompacedfacts:
        cmpt,same_orientation = compatibles(raw_fact_snps, compacted_facts[j])
        if cmpt:
            if same_orientation:    result.add(int(j))
            else:                   result.add(-int(j))
            
    return result


get_allele_id = lambda x: x.split("_")[-1]+x.split("_")[1][0]   #>SNP_higher_path_9 to "9h"
get_coverage  = lambda x: int(x.split("_")[-1])                 #C1_31 to int(31)
def index_allele_coverage(raw_disco_fa_file_name, read_set_id, alleles_coverage = {}):
    """
    For each allele name (eg 21112l) provides its coverage in the considered read set)
    Returns a dictionary: allele_id -> coverage
    Used only by "detects_allele_coverage"
    """
    mfile = open(raw_disco_fa_file_name)
    # first variant rteated appart to recover the position of the coverage in which we are interested
    coverage_field=-1
    while True:
        comment = mfile.readline()
        if not comment: break
        if not comment.startswith(">SNP"): continue # do not deal with indels for now
        s_comment = comment.strip().split("|")
        mfile.readline()        # sequence we don't care
        for i,field_content in enumerate(s_comment):
            if field_content.startswith("C"+str(read_set_id)+"_"):
                coverage_field=i
                break
        assert coverage_field != -1, "Read set id "+str(read_set_id)+" not in "+comment
        allele_id = get_allele_id(s_comment[0])
        coverage  = get_coverage(s_comment[coverage_field])
        alleles_coverage[allele_id]=coverage
    
    
    
    # other variants
    while True:
        # >SNP_higher_path_9|P_1:30_C/T,P_2:35_T/G|high|nb_pol_2|left_unitig_length_31|right_unitig_length_66|C1_31|Q1_63|G1_0/1:398,14,458|rank_0
        # ggtgcagacaacccggcaggtgttgatgataAAGATCTGGTTAAATACGCCGATATTGGCGCGACTTACTATTTCAATAAAAACATGTCCACCTACGttgactataaaatcaacctgttggatgaagatgacagcttctacgctgccaatggcatctctaccg
        # >SNP_lower_path_9|P_1:30_C/T,P_2:35_T/G|high|nb_pol_2|left_unitig_length_31|right_unitig_length_66|C1_28|Q1_63|G1_0/1:398,14,458|rank_0
        # ggtgcagacaacccggcaggtgttgatgataAAGATCTGGTTAAATACGCCGATATTGGCGTGACTGACTATTTCAATAAAAACATGTCCACCTACGttgactataaaatcaacctgttggatgaagatgacagcttctacgctgccaatggcatctctaccg
        comment = mfile.readline()
        if not comment: break
        if not comment.startswith(">SNP"): continue # do not deal with indels for now
        comment = comment.strip().split("|")
        mfile.readline()        # sequence we don't care
        allele_id = get_allele_id(comment[0])
        coverage  = get_coverage(comment[coverage_field])
        alleles_coverage[allele_id]=coverage
        
    mfile.close()
    return alleles_coverage

def detects_allele_coverage(compacted_facts, raw_disco_co_file_name, raw_disco_unco_file_name, read_set_id):
    """
    Given the compacted facts indexed and the raw disco output: 
    for each compacted fact, find all allele that belong to it and store its coverage
    Returns a dictionary: compacted_fact_id -> allele_weight
    """
    alleles_coverage = index_allele_coverage(raw_disco_co_file_name, read_set_id)
    alleles_coverage = index_allele_coverage(raw_disco_unco_file_name, read_set_id, alleles_coverage)
    compacted_fact_allele_weight = {}              # For each compacted fact id, stores its weight
    for fact_id, fact_value in compacted_facts.items():
        compacted_fact_allele_weight[fact_id] = []

        for allele_id in fact_value: #{'10540l', '4734l', '29633h'}
            allele_coverage = alleles_coverage[get_left_clean_snp(allele_id)]
            compacted_fact_allele_weight[fact_id].append(allele_coverage)
    return compacted_fact_allele_weight

def detects_facts_coverage(compacted_facts, snp_to_fact_id, raw_facts_file_name):
    """
    Given the compacted facts indexed and the raw phasing information: for each compacted fact, find all facts that belong to it and compute its estimated coverage
    Returns a dictionary: compacted_fact_id -> weight
    """
    compacted_fact_weight = {}              # For each compacted fact id (int), stores its weight
    mfile = open(raw_facts_file_name)
    for line in mfile.readlines():
        if line[0]=="#" : continue          # comment
        coverage = int(line.strip().split("=>")[-1]) # -10011l_0;13979l_-57;21112l_-22;19270l_-14; => 4
        line=line.strip().split("=>")[0].split()     # remove coverage and split into two facts if needed
        for rawfact in line:
            if len(rawfact)==0: continue
            # detects all compacted_facts in which the rawfact occurs: 
            matching_compacted_fact_ids = get_compatible_facts(rawfact, compacted_facts, snp_to_fact_id)
            for matching_compacted_fact_id in matching_compacted_fact_ids: 
                matching_compacted_fact_id = abs(matching_compacted_fact_id) # remove the useless sign here
                if matching_compacted_fact_id not in compacted_fact_weight: 
                    compacted_fact_weight[matching_compacted_fact_id]=0
                compacted_fact_weight[matching_compacted_fact_id]+=coverage         
    mfile.close()
    return compacted_fact_weight       

def print_facts(phasing_file,compacted_fact_weight, compacted_fact_allele_weight):
    cpt=0
    mfile=open(phasing_file)
    for line in mfile:
        if line[0] != "S" : continue
        #S       0       -5001l;-8805h;-12869h;-25834l;-47306l;38133l;
        line=line.strip()
        compacted_fact_id=int(line.split()[1])
        fact_weight=0
        # assert (compacted_fact_id in compacted_fact_weight)
        fact_weight = compacted_fact_weight[compacted_fact_id]

        # assert(str(compacted_fact_id) in compacted_fact_allele_weight)

        alleles_weight = compacted_fact_allele_weight[str(compacted_fact_id)]
        



        print(f"{str(line)}\tFC:i:{str(fact_weight)}\tmin:{min(alleles_weight)}\tmax:{max(alleles_weight)}\tmean:{sum(alleles_weight)/len(alleles_weight)}\tAC:{';'.join(str(e) for e in alleles_weight)};")
        # print(line+"\tFC:i:"+str(fact_weight)+"\tRC:i:"+str(alleles_weight[1])+"\tmin:i:"+str(alleles_weight[0])+"\tmax:i:"+str(alleles_weight[1])+"\tmean:i:"+str(alleles_weight[2]))  ## RC is max ([1]) as we take the max (17/02/2020)
        cpt+=1
    sys.stderr.write(str(cpt)+" facts written\n")
    mfile.close()

def add_pair_edge (pair_edges, left, right, abundance):
    """
    Add a new pair to all stored pairs. Creates cells and dictionary if needed
    Stores a canonical representation of pairs (smallest id left)
    """
    if left > right: left, right = right, left # swap values if left > right, in order to avoid a -> b and -b -> -a
    if left not in pair_edges: pair_edges[left]={}
    if right not in pair_edges[left]: pair_edges[left][right]=0
    pair_edges[left][right]+=abundance


def detects_pairs_of_linked_compacted_paths(compacted_facts, snp_to_fact_id, raw_facts_file_name, fact_overlaps):
    """
    given the compacted facts indexed and the raw phasing information, detects pairs of facts that are co-mapped by at least one pair of paired non compacted facts
    returns a dictionary compacted_fact_id -> {compacted_fact_id} such that the key is smaller than all values.
    """
    pair_edges = {}                         # For each "left" (arbitrary) compacted fact (key) link to a dictionnary right compacted fact -> number of occurrences 
    mfile = open(raw_facts_file_name)
    for line in mfile.readlines():
        if line[0]=="#" : continue          # comment
        abundance = int(line.strip().split("=>")[1])
        line=line.strip().split("=>")[0]    # remove coverage
        line=line.strip().split()        # two pairs
        if len(line)<2: continue            # we consider only pairs of facts
        # for the first fact, detects all compacted_facts in which it occurs: 
        left_compacted_facts = get_compatible_facts(line[0], compacted_facts, snp_to_fact_id)
        # for the second fact, detects all compacted_facts in which it occurs: 
        right_compacted_facts = get_compatible_facts(line[1], compacted_facts, snp_to_fact_id)
        ### if compacted facts matched, make all pairs
        if len(left_compacted_facts)>0 and len(right_compacted_facts)>0:
            for left_compacted_fact_id in left_compacted_facts:
                for right_compacted_fact_id in right_compacted_facts:
                    if left_compacted_fact_id in fact_overlaps and right_compacted_fact_id in fact_overlaps[left_compacted_fact_id]:
                        continue    # this pair only retreives two facts that overlap
                    if right_compacted_fact_id in fact_overlaps and left_compacted_fact_id in fact_overlaps[right_compacted_fact_id]:
                        continue    # this pair only retreives two facts that overlap

                    add_pair_edge(pair_edges, left_compacted_fact_id, right_compacted_fact_id, abundance)

    mfile.close()
    return pair_edges 
    
def print_pair_edges_gfa_style(pair_edges, occurrence_min=1):
    cpt=0
    for left_fact_id in pair_edges:
        left_sign = ''
        if left_fact_id<0:  left_sign = '-'
        else:               left_sign = '+'
        abs_left_fact_id = abs(left_fact_id)
        for right_fact_id in pair_edges[left_fact_id]:
            # the right sign is reversed as pairend reads map -->  <--. Hence if right is forward on a fact we have to reverse this fact
            right_sign = ''
            if right_fact_id<0:     right_sign = '+'
            else:                   right_sign = '-'
            abs_right_fact_id = abs(right_fact_id)
            
            
            
            if pair_edges[left_fact_id][right_fact_id]>=occurrence_min:
                cpt+=1
                print ("L\t"+str(abs_left_fact_id)+"\t"+left_sign+"\t"+str(abs_right_fact_id)+"\t"+right_sign+"\t0M\tFC:i:"+str(pair_edges[left_fact_id][right_fact_id]))
    sys.stderr.write(str(cpt)+" paired fact written\n")
    
def print_facts_overlaps(phasing_file):
    fact_overlaps={}
    mfile=open(phasing_file)
    cpt=0
    for line in mfile:
        if line[0] != "L" : continue
        cpt+=1
        #L       17012   -       23084   +       2M
        line=line.strip()
        print(line)
        # store pairs: 
        source = int(line.split()[1])
        target = int(line.split()[3])
        if source > target: 
            tmp = source
            source = target
            target = tmp
        if source not in fact_overlaps: 
            fact_overlaps[source]=set()
        fact_overlaps[source].add(target)

    sys.stderr.write(str(cpt)+" facts overlaps written\n")
    return fact_overlaps
    
    
    
    
    

def detects_pairs_of_edges_sharing_snp(compacted_facts, snp_to_fact_id):
    """ 
    detects which facts share at least a snp id with incompatible h/l
    returns a dictionary fact_id -> set(fact_ids) (key is lower than any fact in the value)
    """
    facts_shared_snps = {}
    for key,values in compacted_facts.items():
        for oriented_allele in values:
            snp_id_only=get_left_clean_snp(oriented_allele).split("_")[0][:-1]      # get the snp id non oriented
            horl = get_left_clean_snp(oriented_allele).split("_")[0][-1]            # get the path 'h' or 'l' of the SNP
            # print("snp_id_only",snp_id_only)
            if snp_id_only in snp_to_fact_id: # the snp may be absent in case it was removed by the sequence concatenation process. 
                for fact_id in snp_to_fact_id[snp_id_only]:
                    if int(fact_id)<=int(key): 
                        continue
                    fact = compacted_facts[fact_id]
                    for snp_id in fact: 
                        ### checks that h or l values are disctincts between the two facts 
                        if get_left_clean_snp(snp_id).split("_")[0][:-1] == snp_id_only and get_left_clean_snp(snp_id).split("_")[0][-1]!=horl: # TO VALIDATE 9/9/2019
                            if key not in facts_shared_snps: facts_shared_snps[key] = set()    
                            facts_shared_snps[key].add(fact_id)
    # for key, value in facts_shared_snps.items():
    #     print(key,value)
    return facts_shared_snps
            
def print_pairs_of_edges_sharing_snp(facts_shared_snps):
    cpt=0
    for key, values in facts_shared_snps.items():
        for value in values: 
            print("L\t"+key+"\t+\t"+value+"\t+\t-2M")
            cpt+=1
    sys.stderr.write(str(cpt)+" pairs of facts sharing at least one snp written\n")


def main (phasing_file,raw_facts_file_name, raw_disco_co_file_name, raw_disco_unco_file_name, read_set_id):
    sys.stderr.write("#INDEX FACTS\n")
    compacted_facts, snp_to_fact_id = set_indexes_from_gfa(phasing_file)
    
    sys.stderr.write("#COMPUTE THE COMPACTED FACT COVERAGES\n")
    compacted_fact_weight = detects_facts_coverage(compacted_facts, snp_to_fact_id, raw_facts_file_name)
    
    sys.stderr.write("#COMPUTE THE COMPACTED FACT ALLELE COVERAGES\n")
    compacted_fact_allele_weight=detects_allele_coverage(compacted_facts, raw_disco_co_file_name, raw_disco_unco_file_name, read_set_id)
    
    sys.stderr.write("#PRINT COMPACTED FACTS \n")
    print_facts(phasing_file,compacted_fact_weight, compacted_fact_allele_weight)
    
    sys.stderr.write("#PRINT COMPACTED FACT OVERLAPS \n")
    fact_overlaps=print_facts_overlaps(phasing_file)

    sys.stderr.write("#COMPUTE PAIRS OF COMPACTED FACT GRAPH\n")
    pair_edges=detects_pairs_of_linked_compacted_paths(compacted_facts, snp_to_fact_id, raw_facts_file_name,fact_overlaps)
    
    sys.stderr.write("#PRINT EDGES OF COMPACTED FACT GRAPH\n")
    print_pair_edges_gfa_style(pair_edges) 
    
    sys.stderr.write("#COMPUTE THE FACTS SHARING AT LEAST ONE SNP\n")
    facts_shared_snps = detects_pairs_of_edges_sharing_snp(compacted_facts, snp_to_fact_id)
    print_pairs_of_edges_sharing_snp(facts_shared_snps)
    
    
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],  sys.argv[5]) # compacted_facts.gfa phased_alleles_read_set_id_1.txt discoRes_k_31_c_2_D_0_P_3_b_2_coherent.fa discoRes_k_31_c_2_D_0_P_3_b_2_uncoherent.fa 1