File: VCF_creator.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (184 lines) | stat: -rwxr-xr-x 9,307 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#*****************************************************************************
#   VCF_Creator: mapping and VCF creation feature in DiscoSnp++
#   Copyright (C) 2015  INRIA
#   Author: C.Riou
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as
#  published by the Free Software Foundation, either version 3 of the
#  License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#*****************************************************************************
import os
import sys
import subprocess
import re
import time
import getopt
from functionObjectVCF_creator import *
from ClassVCF_creator import *


                

#Help
def usage():
        usage= """
        ################################
                Run VCF_creator
        ################################
        
        -h --help : print this message
        -s --sam_file : <file>.sam of the alignment

        -o --output : vcf file 
        -f --output_filtered_SAM : if provided, a SAM file in which uncorrectly mapped prediction (corresponding to filter '.' in the provided VCF) are removed is output in this file.
        
        """
        print(usage)
        
def main():
        #Default value
        listName=[] #List from the file name used to create the VCF
        nbGeno=0 #number of genotype for every path
        filtered_sam=False
        filtered_sam_file=""
        VCFFile = None
        ###OPTIONS 
        try:
                opts, args = getopt.getopt(sys.argv[1:],"s:o:f:h",["help","sam_file=","output=","output_filtered_SAM="])
                if not opts:
                        usage()
                        sys.exit(2)
        except getopt.GetoptError as e:
                sys.stderr.write(f"--- {e}\n")
                usage()
                sys.exit(2)
        for opt, arg in opts : 
                if opt in ("-h", "--help"):
                        usage()
                        sys.exit(2)
                elif opt in ("-s","--sam_file"):
                        boolmyname=False
                        fileName=arg
                        if os.path.isfile(fileName):#checks if the file exists
                                listNameFile=fileName.split(".")
                                if "BWA_OPT" in listNameFile[0]: #When the stream_file is created by run_VCF_creator.sh ; it adds BWA_OPT to separate the name of the file and the BWA options
                                        boolmyname=True #Boolean to know if the file name contains the option of BWA
                                        listName=listNameFile[0].split("BWA_OPT") #Parsing of the file name listName[0] corresponds to the name of the discofile ; listName[1] corresponds to the bwa options
                                        listName[1]=listName[1].replace("_", " ")
                                stream_file=open(fileName,'r')
                        else : 
                                sys.stderr.write(f"File \"{arg}\" does not exist")
                                sys.exit(2)  
                elif opt in ("-o","--output"):
                        if not arg:
                                sys.stderr.write("-o needs a value\n")
                                sys.exit(2)
                        
                        VCFFile = open(arg,'w')
                elif opt in ("-f","--output_filtered_SAM"):
                        if arg!=None:
                                filtered_sam = True
                                filtered_sam_file = open(arg,'w')
                        else:
                                sys.stderr.write("!! No filtered sam output !!\n")
                                sys.exit(2)          
                else:
                        sys.stderr.write(f"Unknown option {format(opt)}\n")
                        usage()
                        sys.exit(2)

        if not VCFFile:
                VCFFile = sys.stdout
                
        #Creates the VCF file and prints the header into it 
        PrintVCFHeader(VCFFile,listName,fileName,boolmyname)
        nbGeno=CounterGenotype(fileName)
        #---------------------------------------------------------------------------------------------------------------------------
        #---------------------------------------------------------------------------------------------------------------------------
        #Generates the field index (first occurrence of "contig", and so on)
        fieldIndex=GetIndex(fileName)


        #---------------------------------------------------------------------------------------------------------------------------
        #---------------------------------------------------------------------------------------------------------------------------
        #Start to read the file two lines by two lines
        if ".sam" in fileName: #Checks if it's a stream_file
                while True:
                        line1=stream_file.readline() 
                        if not line1: break #End of file  
                        line1 = line1.strip()
                        if line1.startswith('@'):
                                if filtered_sam:
                                        filtered_sam_file.write(line1) 
                                continue #We do not read headers                                
                        
                        while True:
                                listline1=line1.split("\t") 
                                if len(listline1) > 1 and int(listline1[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence 
                                        line1=stream_file.readline()
                                else: break
                        line2=stream_file.readline().strip() #Read couple of lines
                        while True:
                                listline2=line2.split("\t") 
                                if len(listline2) > 1 and int(listline2[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence 
                                        line2=stream_file.readline()
                                        if not line2: break # happens when second part of the previous pair (last one) add an unread secondary alignment
                                else: break           
                        #Initializes variant object with the samline
                        #if InitVariant(line1,line2)[0]==1:
                        #                continue
                        if not line2: break
                        variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex) #Fills the object with the line of the stream_file                
                        if variant_object.CheckCoupleVariantID()==1: #Checks whether the two lines are from the same path
                                sys.exit(1)
                        #Checks the mapping on reference and determines the shift with the reference, which path is the reference ...
                        table=MappingTreatement(variant_object,vcf_field_object,nbGeno)
                        #Added by Pierre Peterlongo, Sept 2015. Outputs a new SAM file corresponding to mapped sequences (PASS or MULTIPLE)
                        if(filtered_sam and vcf_field_object.filterField!="."):
                                filtered_sam_file.write(line1)
                                filtered_sam_file.write(line2)
                
                        #Fills the VCF file
                        variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)


                        

        elif ".fa" in fileName: #Treatement of the fasta file (no mapping information)
                while True:
                        line1=stream_file.readline()
                        if not line1: break #End of file
                        seq1=stream_file.readline() #Reads the seq associate to the variant
                        line2=stream_file.readline() #Reads a couple of line
                        seq2=stream_file.readline()
                        #Initializes variant object with the samline
                        variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex)
                        table=UnmappedTreatement(variant_object,vcf_field_object,nbGeno,seq1,seq2)
                        variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)


        VCFFile.close()
        stream_file.close()
        if filtered_sam:
                filtered_sam_file.close()





if __name__ == '__main__':
        main()
        sys.stderr.write(f"VCF file had been generated. Beware, this is a zero-based VCF file.\n")
        sys.stderr.write(f"Please use 'zero2one.py' script if you wish to obtain a one-based VCF.\n")