File: 1SNP_per_cluster.py

package info (click to toggle)
discosnp 1%3A2.6.2-4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 3,652 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 12
file content (176 lines) | stat: -rwxr-xr-x 5,166 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
# -*- coding: utf-8 -*-


''' ***********************************************

Script to select only one variant per locus (cluster) in a discoSnp vcf output file (.vcf)
Author - Claire Lemaitre, Pierre Peterlongo, Inria

Usage:
python3 1SNP_per_cluster.py -i vcf_file [-o new_vcf_file]

Details:
The selected variant is not chosen at random, for a given cluster, it is the one with the less missing genotypes (if ties, it is the first read in the input file)

*********************************************** '''

import sys
import getopt


    
def usage():
    '''Usage'''
    print("-----------------------------------------------------------------------------")
    print(sys.argv[0]+" : selects one variant per cluster")
    print("-----------------------------------------------------------------------------")
    print("usage: "+sys.argv[0]+" -i vcf_file [-o output_file]")
    print("  -i: vcf file [mandatory]")
    print("  -o: output vcf file (default = stdout)")
    print("  -h: help")
    print("-----------------------------------------------------------------------------")
    sys.exit(2)
    

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:o:")
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))
        usage()
        sys.exit(2)

    # Default parameters
    vcf_file =       None
    out_file =      None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-i"):
            vcf_file = arg
        elif opt in ("-o", "--out"):
            out_file = arg
        else:
            assert False, "unhandled option"

    if vcf_file==None:
        print ("Error: option -i is mandatory")
        usage()
        sys.exit(2)

    format_ok = check_format(vcf_file)
    if not format_ok:
        print("Error: the format of the input vcf is not correct, it must contain clustering information")
        sys.exit(2)
    
    dict_, nb_SNP_before, nb_SNP_after = store_info(vcf_file)
    output_newvcf(vcf_file, out_file, dict_)




def check_format(vcf_file):
    ''' Checks if the vcf has the correct format, ie : the INFO field must contain clustering information, such as:
        Ty=SNP;Rk=1;UL=1;UR=2;CL=.;CR=.;Genome=.;Sd=.;Cluster=79466;ClSize=12
        '''
        
    filin = open(vcf_file, 'r')
    checked = False
    while not checked:
        line = filin.readline()
        if line.startswith("#"): continue
        INFO_split = line.split("\t")[7].split(";")
        checked = True
        if len(INFO_split) < 10: return False
        tmp_cluster = INFO_split[8].split("Cluster=")
        if len(tmp_cluster) < 2: return False
        if tmp_cluster[1] == ".": return False
        try:
            cl_id = int(tmp_cluster[1])
        except ValueError:
            return False
        return True
                
    filin.close()

def store_info(vcf_file):

    dict_ = {}  ## {num_cluster : [num_SNP ayant le moins de géno manquants, nb_missgeno]}
    filin = open(vcf_file, 'r')
    nb_SNP_tot = 0

    while True:
        """#SNP_higher_path_14643	30	14643	C	T	.	.	Ty=SNP;Rk=0.55424;UL=0;UR=0;CL=0;CR=0;Genome=.;Sd=.;Cluster=1285;ClSize=4	GT:DP:PL:AD:HQ	0/1:38:554,48,75:7,31:71,71	0/1:20:63,23,263:15,5:71,71"""

        line = filin.readline()
        if not line: break
        if line.startswith("#"): continue

        nb_SNP_tot += 1

        num_cluster = int(line.split("Cluster=")[1].split(";")[0])
          
        if num_cluster == -1: continue
        id_SNP = line.split("\t")[2]

        if num_cluster not in dict_:
            dict_[num_cluster] = ["x", sys.maxsize]                             # ghost best variant for this new cluster

        genotypes = [i.split(":")[0] for i in line.split("\t")[9:]]
        nb_missing = 0

        for ind in genotypes:
            if ind[0] == "." : 
                nb_missing += 1
            
       
        if nb_missing >= dict_[num_cluster][1]: continue
        dict_[num_cluster] = [id_SNP, nb_missing]

    filin.close()
    return dict_, nb_SNP_tot, len(dict_)



def output_newvcf(vcf_file, out_file, dict_) :
    
    filin = open(vcf_file, 'r')
    if out_file:
        filout=open(out_file,'w')
    else:
        filout = sys.stdout
        

    while True:

        line = filin.readline() #SNP_higher_path_14643	30	14643	C	T	.	.	Ty=SNP;Rk=0.55424;UL=0;UR=0;CL=0;CR=0;Genome=.;Sd=.;Cluster=1285;ClSize=4	GT:DP:PL:AD:HQ	0/1:38:554,48,75:7,31:71,71	0/1:20:63,23,263:15,5:71,71
        if not line: break
        if line.startswith("#"): 
             filout.write(line)        
             continue

        try:
            cluster = int(line.split("Cluster=")[1].split(";")[0])
        except ValueError:
                print ("No cluster size information stored in the vcf, exit")
                sys.exit(1)
        if cluster == -1 :  continue
 
        id_SNP = line.split("\t")[2]       
        if id_SNP != dict_[cluster][0]: continue
       
        filout.write(line)

    filin.close()
    filout.close()


    
    
if __name__ == "__main__":
    main()