File: format_phased_variants_for_haplotyping.py

package info (click to toggle)
discosnp 4.4.4-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,264 kB
  • sloc: python: 4,986; cpp: 3,717; sh: 3,205; makefile: 12
file content (122 lines) | stat: -rw-r--r-- 5,583 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import sys
#usage: 
### first create connected components from disco (-A option)
#sh from_phased_alleles_to_clusters.sh phased_alleles_read_set_id_1.txt # creates file connected_components_phased_alleles_read_set_id_1.txt
### them from the .fa file, the id of the set your interested in (e.g. 1 for phased_alleles_read_set_id_1.txt, this will correspond to C1 coverage in the fa file), the file containing the connected components, and the phased_alleles_read_set_id_X.txt file, generate the fact file
#python3 format_phased_variants_for_haplotyping.py mapping_k_31_c_auto_D_100_P_10_b_0_coherent.fa 1 connected_components_phased_alleles_read_set_id_1.txt phased_alleles_read_set_id_1.txt  > phased_alles_read_set_1_facts.txt


if not len(sys.argv)==5:
    print ("usage: python3 format_phased_variants_for_haplotyping.py <file coherent.fa> <id number><connected_component_file><phased_allele_file>")
    print (" * coherent.fa file: the file generated by discoSnp")
    print (" * id number is the id of the read set, for which variants are phased. With i, this corresponds to Ci in the .fa file headers.")
    print (" * connected_component_file: file obtained from \"from_phased_alleles_to_clusters.sh phased_alleles_read_set_id_1.txt\" continaing connected component of phased alleles")
    print (" * phased_alleles_read_set_id_1.txt: file generated by discoSnp (with the hidden -A option. The value 1 here shoud correspond to \"id number\"")
    sys.exit(0)

coherent_fa_file = open(sys.argv[1])
set_id = sys.argv[2]
cc_file = open(sys.argv[3])
phased_alleles_file = open(sys.argv[4])



def store_abundances(coherent_fa_file,set_id):
    pos_coverage_determined=False
    pos_coverage=-1
    coverages={}
    for oline in coherent_fa_file: #>SNP_higher_path_991|P_1:30_C/G|high|nb_pol_1|C1_38|C2_0|Q1_0|Q2_0|G1_0/0:6,119,764|G2_1/1:664,104,6|rank_1
        if oline[0] != '>': continue
        line=oline.rstrip().split('|')
        id=line[0].split('_')[-1]    #here 991
        id+=line[0].split('_')[1][0] #'h' or 'l'
        if not pos_coverage_determined:
            for pos_coverage in range(len(line)):
                if line[pos_coverage][0]=='C':
                    value=line[pos_coverage][1:].split('_')[0]
                    if value==set_id:
                        pos_coverage_determined=True
                        break
            if not pos_coverage_determined:
                print ("Set id", set_id, "not findable in header like ", oline.rstrip())
                print ("ciao")
                sys.exit(0)
        coverages[id]=line[pos_coverage].split('_')[1] # get the right coverage corresponding to the searche read set
    return coverages
    

def store_cc(cc_file):
    cc={}
    for i,oline in enumerate (cc_file): # 852 1891 3484 2641 5758 3247
        oline=oline.rstrip().split()
        for idf in oline: 
            idf=int(idf)
            if idf in cc:
                print("ERROR, idf is in more than one connected component")
            cc[idf]=i
    return cc
        


def store_phased_alleles(phased_alleles_file):
    phased_alleles={}
    for oline in phased_alleles_file: #-1187h;1001h;2178h; => 5
        oline=oline.lstrip().rstrip()
        if oline[0]=='#': continue
        ids = oline.split(' ')[0].split(';')[:-1]
        abundance = int(oline.split(' ')[-1])
        idlist=[]
        for aid in ids: 
            # if aid[0]=='-': # remove the '-'
            #     aid=aid[1:]
            idlist.append(aid)
        # canonical representation: smallest first (removing with the strip function the eventual first '-' sign): 
        if int(idlist[0].strip('-')[:-1])>int(idlist[-1].strip('-')[:-1]):
            idlist.reverse()
            # change the ortientation = change the sign: 
            for i in range(len(idlist)):
                if idlist[i][0]=='-': idlist[i]=idlist[i][1:]
                else: idlist[i]='-'+idlist[i]

        list_as_string = ""
        for aid in idlist:
            list_as_string+=aid+';'
        # add the list to the phased_alleles or increase its count if not existing:
        if list_as_string in phased_alleles: 
            phased_alleles[list_as_string]+=abundance
        else: 
            phased_alleles[list_as_string]=abundance

                
    return phased_alleles

    
def print_djack_formated_phased_variants(coverages,cc,phased_alleles):
    for aid in coverages:
        current_snp_id=int(aid[:-1])
        if current_snp_id in cc:
            print("snp(cc"+str(cc[current_snp_id])+","+str(current_snp_id)+","+aid[-1]+","+str(coverages[aid])+").")
    for i,list_as_string in enumerate(phased_alleles):#'2686l;4324h;5375h;': 3
        # get the CC: 
        ids=list_as_string.split(';')[:-1]
        abundance = phased_alleles[list_as_string]
        first_id=abs(int(ids[0][:-1]))
        if first_id not in cc: continue
        this_cc=cc[first_id]
        for j in range(1,len(ids)):
            if abs(int(ids[j][:-1])) in cc and cc[abs(int(ids[j][:-1]))] != this_cc:
                print("impossible all variants from ",list_as_string, "are not in the same CC")
                sys.exit(0)
        
            
        for node_order,aid in enumerate(ids):
            print("fact(cc"+str(this_cc)+","+str(i)+","+str(node_order+1)+","+aid[:-1]+","+aid[-1]+").")
        print("count("+str(i)+","+str(abundance)+").")
        
            

coverages=store_abundances(coherent_fa_file,set_id)

cc=store_cc(cc_file)
phased_alleles=store_phased_alleles(phased_alleles_file)
print_djack_formated_phased_variants(coverages,cc,phased_alleles)