File: redundancy_removal_discosnp.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (94 lines) | stat: -rwxr-xr-x 3,645 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python3
# -*- coding: utf-8 -*-
###################################
# from kissnp output: 
#   check pair of variants that start with the same kmer and keep only one bubble
#   check pair of variants that end with the same kmer and keep only one bubble
import sys

def get_first_kmer(seq,k):
    for i in range (len(seq)):
        if seq[i]>='a' and seq[i]<='z': continue
        return seq[i:i+k]


def get_last_kmer(seq,k):
    variant_seen=False
    for i in range (len(seq)):
        if seq[i]>='a' and seq[i]<='z':
            if not variant_seen: continue # nothin to do, continue
            else: # first position after upper case variant:
                return seq[i-k:i]
        else: variant_seen=True
    return seq[i-k+1:]

def add_id(kmer_to_var_id,kmer,var_id):
    if kmer not in kmer_to_var_id:
        kmer_to_var_id[kmer]=[]
    kmer_to_var_id[kmer].append(var_id)
        
def non_empty_intersection(a,b):
    sa = set(a)
    sb = set(b)
    if len(sa.intersection(sb)) >0: return True
    return False
    

def parse(fafile,k,faout):
    start_kmer_to_var_id={}
    stop_kmer_to_var_id ={}
    i=1
    printed=0
    while (True):
        if i%100==0 : 
            sys.stdout.write ("\r"+str(i)+ " bubbles treated, "+str(printed)+" bubbles non redundant")
            sys.stdout.flush()
        i+=1
        com1=fafile.readline()
        if not com1: break
        com1=com1.rstrip() #>SNP_higher_path_1|P_1:30_T/G|high|nb_pol_1|left_unitig_length_22|right_unitig_length_0
        seq1=fafile.readline().rstrip()
        com2=fafile.readline().rstrip()
        seq2=fafile.readline().rstrip()
        
        # get variant_id: 
        var_id=com1.split("|")[0].split("_")[-1]
        
        # deal with starting kmer
        kmer_start1=get_first_kmer(seq1,k)
        kmer_start2=get_first_kmer(seq2,k)
        if kmer_start1 in start_kmer_to_var_id and kmer_start2 in start_kmer_to_var_id:
            list_var_id_kmer_start1 = start_kmer_to_var_id[kmer_start1]
            list_var_id_kmer_start2 = start_kmer_to_var_id[kmer_start2]
            if non_empty_intersection(list_var_id_kmer_start1,list_var_id_kmer_start2): continue # this variant has already been seen with another context
        
        # deal with ending kmer
        kmer_stop1=get_last_kmer(seq1,k)
        kmer_stop2=get_last_kmer(seq2,k)
        # print (kmer_stop1,kmer_stop2)
        if kmer_stop1 in stop_kmer_to_var_id and kmer_stop2 in stop_kmer_to_var_id:
            list_var_id_kmer_stop1 = stop_kmer_to_var_id[kmer_stop1]
            list_var_id_kmer_stop2 = stop_kmer_to_var_id[kmer_stop2]
            if non_empty_intersection(list_var_id_kmer_stop1,list_var_id_kmer_stop2): continue  # this variant has already been seen with another context
        
        add_id(start_kmer_to_var_id,kmer_start1,var_id)
        add_id(start_kmer_to_var_id,kmer_start2,var_id)
        add_id(stop_kmer_to_var_id, kmer_stop1, var_id)
        add_id(stop_kmer_to_var_id, kmer_stop2, var_id)
        faout.write(com1+"\n")
        faout.write(seq1+"\n")
        faout.write(com2+"\n")
        faout.write(seq2+"\n")
        printed+=1
    sys.stdout.write ("\r"+str(i)+ " bubbles treated, "+str(printed)+" bubbles non redundant\n")

if len(sys.argv) !=4: 
    print ("Script "+ sys.argv[0].split("/")[-1])
    print ("  From a discoSnp .fa output: from all variants that start or stop with the same kmer, keep only one of their occurrences")
    print ("  usage: "+ sys.argv[0].split("/")[-1]+ " input.fa k_value output.fa")
    exit(1)
fafile=open(sys.argv[1],"r")
k=int(sys.argv[2])
faout=open(sys.argv[3],"w")
parse(fafile,k,faout)