1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
|
#!/usr/bin/env python
import sys
import os
import time
import random
import re
import subprocess
from argparse import ArgumentParser
from blaster import Blaster
##########################################################################
# PARSE COMMAND LINE OPTIONS
##########################################################################
parser = ArgumentParser()
parser.add_argument('-i', '--input',
help="Input file")
parser.add_argument('-l', '--input_list',
help="File with list of files to analyse")
parser.add_argument('-d', '--databases',
help="Comma seperated list of databases to blast against")
parser.add_argument('-o', '--out_path',
help="output directory")
parser.add_argument("-b", "--blastPath", dest="blast_path",
help="Path to blast",
default='blastn')
parser.add_argument("-p", "--databasePath", dest="db_path",
help="Path to the databases",
default='')
parser.add_argument("-c", "--min_cov", dest="min_cov",
help="Minimum coverage",
type=float,
default=0.60)
parser.add_argument("-t", "--threshold", dest="threshold",
help="Blast threshold for identity",
type=float,
default=0.90)
parser.add_argument("--overlap",
help=("Allow hits/genes to overlap with this number of "
"nucleotides. Default: 30."),
type=int,
default=30)
args = parser.parse_args()
##########################################################################
# MAIN
##########################################################################
min_cov = args.min_cov
threshold = args.threshold
# Check if valid input file is provided
input_list = []
if args.input is not None and args.input_list is not None:
sys.exit("Input Error: Please only provide file list or single input "
"file\n")
elif args.input is None and args.input_list is None:
sys.exit("Input Error: No Input were provided!\n")
elif args.input is not None and not os.path.exists(args.input):
sys.exit("Input Error: Input file does not exist!\n")
elif args.input is not None:
inputfile = args.input
input_list.append(inputfile)
elif args.input_list is not None and not os.path.exists(args.input_list):
sys.exit("Input Error: No Input were provided!\n")
elif args.input_list is not None:
inputfile = args.input_list
with open(inputfile, "r") as f:
for line in f:
line = line.rstrip()
input_list.append(line)
# Check if valid output directory is provided
if not os.path.exists(args.out_path):
os.makedirs(args.out_path, exist_ok=True)
out_path = args.out_path
else:
out_path = args.out_path
# Check if valid file with genes is provided
if args.databases is None:
sys.exit("Input Error: No databases sepcified!\n")
else:
databases = args.databases.split(",")
# Check if valid path to BLAST is provided
blast = args.blast_path
db_path = args.db_path
for inp_file in input_list:
# Calling blast and parsing output
blast_run = Blaster(inputfile=inp_file, databases=databases,
db_path=db_path, out_path=out_path, min_cov=min_cov,
threshold=threshold, blast=blast,
allowed_overlap=args.overlap)
results = blast_run.results
query_align = blast_run.gene_align_query
homo_align = blast_run.gene_align_homo
sbjct_align = blast_run.gene_align_sbjct
file_name = inp_file.split("/")[-1].split(".")[0]
out_name = "%s/%s_hit_alignments.txt" % (out_path, file_name)
txt_file_seq_text = dict()
pos_result = list()
# Make result file
tab_file = "%s/%s_results.txt" % (out_path, file_name)
tab = open(tab_file, "w")
for db in results:
if results[db] == "No hit found":
tab.write("%s\n%s\n\n" % (db, results[db]))
else:
pos_result.append(db)
tab.write("%s\n" % (db))
tab.write("Hit\tIdentity\tAlignment Length/Gene Length\tPosition in "
"reference\tContig\tPosition in contig\n")
txt_file_seq_text[db] = list()
for hit in results[db]:
header = results[db][hit]["sbjct_header"]
ID = results[db][hit]["perc_ident"]
sbjt_length = results[db][hit]["sbjct_length"]
HSP = results[db][hit]["HSP_length"]
positions_contig = "%s..%s" % (results[db][hit]["query_start"],
results[db][hit]["query_end"])
positions_ref = "%s..%s" % (results[db][hit]["sbjct_start"],
results[db][hit]["sbjct_end"])
contig_name = results[db][hit]["contig_name"]
# Write tabels
tab.write("%s\t%.2f\t%s/%s\t%s\t%s\t%s\n" % (header, ID, HSP,
sbjt_length,
positions_ref,
contig_name,
positions_contig))
# Writing subjet/ref sequence
ref_seq = sbjct_align[db][hit]
hit_seq = query_align[db][hit]
# Getting the header and text for the txt file output
sbjct_start = results[db][hit]["sbjct_start"]
sbjct_end = results[db][hit]["sbjct_end"]
text = ("%s, ID: %.2f %%, Alignment Length/Gene Length: %s/%s, "
"Positions in reference: %s..%s, Contig name: %s, "
"Position: %s" % (header, ID, HSP, sbjt_length,
sbjct_start, sbjct_end, contig_name,
positions_contig)
)
# Saving the output to print the txt result file allignemts
txt_file_seq_text[db].append((text, ref_seq, homo_align[db][hit],
hit_seq))
tab.close()
txt_file = open(out_name, "w")
for db in pos_result:
# Txt file alignments
txt_file.write("##################### %s #####################\n" % (db))
for text in txt_file_seq_text[db]:
txt_file.write("%s\n\n" % (text[0]))
for i in range(0, len(text[1]), 60):
txt_file.write("%s\n" % (text[1][i:i + 60]))
txt_file.write("%s\n" % (text[2][i:i + 60]))
txt_file.write("%s\n\n" % (text[3][i:i + 60]))
txt_file.write("\n")
txt_file.close()
|