1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
|
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#*****************************************************************************
# VCF_Creator: mapping and VCF creation feature in DiscoSnp++
# Copyright (C) 2015 INRIA
# Author: C.Riou
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#*****************************************************************************
import os
import sys
import subprocess
import re
import time
import getopt
from functionObjectVCF_creator import *
from ClassVCF_creator import *
#Help
def usage():
usage= """
################################
Run VCF_creator
################################
-h --help : print this message
-s --sam_file : <file>.sam of the alignment
-o --output : vcf file
-f --output_filtered_SAM : if provided, a SAM file in which uncorrectly mapped prediction (corresponding to filter '.' in the provided VCF) are removed is output in this file.
"""
print(usage)
def main():
#Default value
listName=[] #List from the file name used to create the VCF
nbGeno=0 #number of genotype for every path
filtered_sam=False
filtered_sam_file=""
VCFFile = None
###OPTIONS
try:
opts, args = getopt.getopt(sys.argv[1:],"s:o:f:h",["help","sam_file=","output=","output_filtered_SAM="])
if not opts:
usage()
sys.exit(2)
except getopt.GetoptError as e:
sys.stderr.write(f"--- {e}\n")
usage()
sys.exit(2)
for opt, arg in opts :
if opt in ("-h", "--help"):
usage()
sys.exit(2)
elif opt in ("-s","--sam_file"):
boolmyname=False
fileName=arg
if os.path.isfile(fileName):#checks if the file exists
listNameFile=fileName.split(".")
if "BWA_OPT" in listNameFile[0]: #When the stream_file is created by run_VCF_creator.sh ; it adds BWA_OPT to separate the name of the file and the BWA options
boolmyname=True #Boolean to know if the file name contains the option of BWA
listName=listNameFile[0].split("BWA_OPT") #Parsing of the file name listName[0] corresponds to the name of the discofile ; listName[1] corresponds to the bwa options
listName[1]=listName[1].replace("_", " ")
stream_file=open(fileName,'r')
else :
sys.stderr.write(f"File \"{arg}\" does not exist")
sys.exit(2)
elif opt in ("-o","--output"):
if not arg:
sys.stderr.write("-o needs a value\n")
sys.exit(2)
VCFFile = open(arg,'w')
elif opt in ("-f","--output_filtered_SAM"):
if arg!=None:
filtered_sam = True
filtered_sam_file = open(arg,'w')
else:
sys.stderr.write("!! No filtered sam output !!\n")
sys.exit(2)
else:
sys.stderr.write(f"Unknown option {format(opt)}\n")
usage()
sys.exit(2)
if not VCFFile:
VCFFile = sys.stdout
#Creates the VCF file and prints the header into it
PrintVCFHeader(VCFFile,listName,fileName,boolmyname)
nbGeno=CounterGenotype(fileName)
#---------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------
#Generates the field index (first occurrence of "contig", and so on)
fieldIndex=GetIndex(fileName)
#---------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------
#Start to read the file two lines by two lines
if ".sam" in fileName: #Checks if it's a stream_file
while True:
line1=stream_file.readline()
if not line1: break #End of file
line1 = line1.strip()
if line1.startswith('@'):
if filtered_sam:
filtered_sam_file.write(line1)
continue #We do not read headers
while True:
listline1=line1.split("\t")
if len(listline1) > 1 and int(listline1[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence
line1=stream_file.readline()
else: break
line2=stream_file.readline().strip() #Read couple of lines
while True:
listline2=line2.split("\t")
if len(listline2) > 1 and int(listline2[1]) & 2048 :#checks if it's not a secondary alignment => means splitted aligned sequence
line2=stream_file.readline()
if not line2: break # happens when second part of the previous pair (last one) add an unread secondary alignment
else: break
#Initializes variant object with the samline
#if InitVariant(line1,line2)[0]==1:
# continue
if not line2: break
variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex) #Fills the object with the line of the stream_file
if variant_object.CheckCoupleVariantID()==1: #Checks whether the two lines are from the same path
sys.exit(1)
#Checks the mapping on reference and determines the shift with the reference, which path is the reference ...
table=MappingTreatement(variant_object,vcf_field_object,nbGeno)
#Added by Pierre Peterlongo, Sept 2015. Outputs a new SAM file corresponding to mapped sequences (PASS or MULTIPLE)
if(filtered_sam and vcf_field_object.filterField!="."):
filtered_sam_file.write(line1)
filtered_sam_file.write(line2)
#Fills the VCF file
variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)
elif ".fa" in fileName: #Treatement of the fasta file (no mapping information)
while True:
line1=stream_file.readline()
if not line1: break #End of file
seq1=stream_file.readline() #Reads the seq associate to the variant
line2=stream_file.readline() #Reads a couple of line
seq2=stream_file.readline()
#Initializes variant object with the samline
variant_object, vcf_field_object=InitVariant(line1,line2,fileName, fieldIndex)
table=UnmappedTreatement(variant_object,vcf_field_object,nbGeno,seq1,seq2)
variant_object.FillVCF(VCFFile,nbGeno,table,vcf_field_object)
VCFFile.close()
stream_file.close()
if filtered_sam:
filtered_sam_file.close()
if __name__ == '__main__':
main()
sys.stderr.write(f"VCF file had been generated. Beware, this is a zero-based VCF file.\n")
sys.stderr.write(f"Please use 'zero2one.py' script if you wish to obtain a one-based VCF.\n")
|