File: VCF_creator.py

package info (click to toggle)
discosnp 2.3.0-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 2,036 kB
  • sloc: cpp: 3,342; python: 1,833; sh: 1,420; makefile: 15
file content (169 lines) | stat: -rwxr-xr-x 7,318 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/bin/python
# -*- coding: utf-8 -*-
#*****************************************************************************
#   VCF_Creator: mapping and VCF creation feature in DiscoSnp++
#   Copyright (C) 2015  INRIA
#   Author: C.Riou
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as
#  published by the Free Software Foundation, either version 3 of the
#  License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#*****************************************************************************
import os
import sys
import subprocess
import re
import time
import getopt
from functionObjectVCF_creator import *
from ClassVCF_creator import *



#Default value
listName=[] #List from the file name used to create the VCF
nbSnp=0 #number of SNP in the input file
nbGeno=0 #number of genotype for every path
filtered_sam=False
filtered_sam_file=""
#Help
def usage():
    usage= """
    ################################
        Run VCF_creator
    ################################
    
    -h --help : print this message
    -s --sam_file : <file>.sam of the alignment

    -o --output : vcf file 
    -f --output_filtered_SAM : if provided, a SAM file in which uncorrectly mapped prediction (corresponding to filter '.' in the provided VCF) are removed is output in this file.
    
    """
    print(usage)
###OPTIONS 
try:
    opts, args = getopt.getopt(sys.argv[1:],"h:s:o:f:",["help","sam_file=","output=","output_filtered_SAM="])
    if not opts:
        usage()
        sys.exit(2)
except getopt.GetoptError as e:
    print(e)
    usage()
    sys.exit(2)
for opt, arg in opts : 
    if opt in ("-h", "--help"):
        usage()
        sys.exit(2)
    elif opt in ("-s","--sam_file"):
        boolmyname=False
        fileName=arg
        if os.path.isfile(fileName):#checks if the file exists
               listNameFile=fileName.split(".")
               if "BWA_OPT" in listNameFile[0]: #When the stream_file is created by run_VCF_creator.sh ; it adds BWA_OPT to separate the name of the file and the BWA options
                       boolmyname=True #Boolean to know if the file name contains the option of BWA
                       listName=listNameFile[0].split("BWA_OPT") #Parsing of the file name listName[0] corresponds to the name of the discofile ; listName[1] corresponds to the bwa options
                       listName[1]=listName[1].replace("_", " ")
               stream_file=open(fileName,'r')
        else : 
              print(("!! No file :" +str(arg)+"!!"))
              sys.exit(2)  
    elif opt in ("-o","--output"):
        if arg!=None:
                VCFFile = open(arg,'w')
        else :
                print("!! No output !!")
                sys.exit(2)      
    elif opt in ("-f","--output_filtered_SAM"):
        if arg!=None:
            filtered_sam = True
            filtered_sam_file = open(arg,'w')
        else:
            print("!! No filtered sam output !!")
            sys.exit(2)      
    else:
        print(("Unkwnown option {} ".format(opt)))
        usage()
        sys.exit(2)

#Creates the VCF file and prints the header into it 
PrintVCFHeader(VCFFile,listName,fileName,boolmyname)
nbGeno=CounterGenotype(fileName)
#---------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------
#Generates the field index (first occurrence of "contig", and so on)
fieldIndex=GetIndex(fileName)


#---------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------
#Start to read the file two lines by two lines
if ".sam" in fileName: #Checks if it's a stream_file
        while True:
                line1=stream_file.readline()   
                if not line1: break #End of file
                if line1.startswith('@'):
                        if filtered_sam:
                                filtered_sam_file.write(line1) 
                        continue #We do not read headers                
                while True:
                        listline1=line1.split("\t")  
                        if int(listline1[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence 
                                line1=stream_file.readline()
                        else:break
                line2=stream_file.readline() #Read couple of lines
                while True:
                        listline2=line2.split("\t") 
                        if int(listline2[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence 
                                line2=stream_file.readline()
                        else:break       
                #Initializes variant object with the samline
                #if InitVariant(line1,line2)[0]==1:
                #        continue
                variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex) #Fills the object with the line of the stream_file        
                if variant_object.CheckCoupleVariantID()==1: #Checks whether the two lines are from the same path
                        sys.exit(1)
                #Checks the mapping on reference and determines the shift with the reference, which path is the reference ...
                table=MappingTreatement(variant_object,vcf_field_object,nbGeno)
                #Added by Pierre Peterlongo, Sept 2015. Outputs a new SAM file corresponding to mapped sequences (PASS or MULTIPLE)
                if(filtered_sam and vcf_field_object.filterField!="."):
                    filtered_sam_file.write(line1)
                    filtered_sam_file.write(line2)
                
                #Fills the VCF file
                variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)
                  

            

elif ".fa" in fileName: #Treatement of the fasta file (no mapping information)
        while True:
                line1=stream_file.readline()
                if not line1: break #End of file
                seq1=stream_file.readline() #Reads the seq associate to the SNP
                line2=stream_file.readline() #Reads a couple of line
                seq2=stream_file.readline()
                #Initializes variant object with the samline
                variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex)
                table=UnmappedTreatement(variant_object,vcf_field_object,nbGeno,seq1,seq2)
                variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)


VCFFile.close()
stream_file.close()
if filtered_sam:
    filtered_sam_file.close()