File: run_blaster.py

package info: python-cgecore 1.5.6+ds-1
  • area: main
  • in suites: bookworm, bullseye
  • size: 220 kB
  • sloc: python: 2,112; makefile: 6
#!/usr/bin/env python3
import sys
import os

from argparse import ArgumentParser
from blaster import Blaster

##########################################################################
# PARSE COMMAND LINE OPTIONS
##########################################################################

parser = ArgumentParser()
parser.add_argument('-i', '--input',
                    help="Input file")
parser.add_argument('-l', '--input_list',
                    help="File with list of files to analyse")
parser.add_argument('-d', '--databases',
                    help="Comma seperated list of databases to blast against")
parser.add_argument('-o', '--out_path',
                    help="output directory")
parser.add_argument("-b", "--blastPath", dest="blast_path",
                    help="Path to blast",
                    default='blastn')
parser.add_argument("-p", "--databasePath", dest="db_path",
                    help="Path to the databases",
                    default='')
parser.add_argument("-c", "--min_cov", dest="min_cov",
                    help="Minimum coverage",
                    type=float,
                    default=0.60)
parser.add_argument("-t", "--threshold", dest="threshold",
                    help="Blast threshold for identity",
                    type=float,
                    default=0.90)
parser.add_argument("--overlap",
                    help=("Allow hits/genes to overlap with this number of "
                          "nucleotides. Default: 30."),
                    type=int,
                    default=30)

args = parser.parse_args()
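
# Example invocation (file names, database names and paths are illustrative):
#   ./run_blaster.py -i sample.fasta -d db1,db2 -o results/ \
#       -p /path/to/databases -b blastn -c 0.60 -t 0.90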

##########################################################################
# MAIN
##########################################################################
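
# Coverage and identity cut-offs are given as fractions (defaults 0.60 and
# 0.90) and passed on to Blaster unchanged.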

min_cov = args.min_cov
threshold = args.threshold

# Check if valid input file is provided
input_list = []
if args.input is not None and args.input_list is not None:
   sys.exit("Input Error: Please only provide file list or single input "
            "file\n")

elif args.input is None and args.input_list is None:
   sys.exit("Input Error: No Input were provided!\n")

elif args.input is not None and not os.path.exists(args.input):
   sys.exit("Input Error: Input file does not exist!\n")

elif args.input is not None:
   inputfile = args.input
   input_list.append(inputfile)

elif args.input_list is not None and not os.path.exists(args.input_list):
   sys.exit("Input Error: No Input were provided!\n")

elif args.input_list is not None:
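   # The list file is expected to contain one input file path per line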
   inputfile = args.input_list
   with open(inputfile, "r") as f:
      for line in f:
         line = line.rstrip()
         input_list.append(line)

# Check that an output directory was given and create it if needed
if args.out_path is None:
   sys.exit("Input Error: No output directory specified!\n")
os.makedirs(args.out_path, exist_ok=True)
out_path = args.out_path

# Check if valid file with genes is provided
if args.databases is None:
   sys.exit("Input Error: No databases sepcified!\n")
else:
   databases = args.databases.split(",")


# Path to the BLAST binary and to the database directory
blast = args.blast_path
db_path = args.db_path

for inp_file in input_list:
   # Calling blast and parsing output
   blast_run = Blaster(inputfile=inp_file, databases=databases,
                       db_path=db_path, out_path=out_path, min_cov=min_cov,
                       threshold=threshold, blast=blast,
                       allowed_overlap=args.overlap)
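   # Parsed hits and per-gene alignments produced by the Blaster run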

   results = blast_run.results
   query_align = blast_run.gene_align_query
   homo_align = blast_run.gene_align_homo
   sbjct_align = blast_run.gene_align_sbjct

   file_name = inp_file.split("/")[-1].split(".")[0]
   out_name = "%s/%s_hit_alignments.txt" % (out_path, file_name)
   txt_file_seq_text = dict()
   pos_result = list()

   # Make result file
   tab_file = "%s/%s_results.txt" % (out_path, file_name)
   tab = open(tab_file, "w")
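   # One section per database: the database name, a header row and one row
   # per hit (or "No hit found")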

   for db in results:

      if results[db] == "No hit found":
         tab.write("%s\n%s\n\n" % (db, results[db]))
      else:
         pos_result.append(db)
         tab.write("%s\n" % (db))
         tab.write("Hit\tIdentity\tAlignment Length/Gene Length\tPosition in "
                   "reference\tContig\tPosition in contig\n")

         txt_file_seq_text[db] = list()

         for hit in results[db]:
            header = results[db][hit]["sbjct_header"]
            ID = results[db][hit]["perc_ident"]
            sbjt_length = results[db][hit]["sbjct_length"]
            HSP = results[db][hit]["HSP_length"]
            positions_contig = "%s..%s" % (results[db][hit]["query_start"],
                                           results[db][hit]["query_end"])
            positions_ref = "%s..%s" % (results[db][hit]["sbjct_start"],
                                        results[db][hit]["sbjct_end"])
            contig_name = results[db][hit]["contig_name"]

            # Write table row for this hit
            tab.write("%s\t%.2f\t%s/%s\t%s\t%s\t%s\n" % (header, ID, HSP,
                                                         sbjt_length,
                                                         positions_ref,
                                                         contig_name,
                                                         positions_contig))
            # Get the subject (reference) and query alignment sequences
            ref_seq = sbjct_align[db][hit]
            hit_seq = query_align[db][hit]

            # Getting the header and text for the txt file output
            sbjct_start = results[db][hit]["sbjct_start"]
            sbjct_end = results[db][hit]["sbjct_end"]
            text = ("%s, ID: %.2f %%, Alignment Length/Gene Length: %s/%s, "
                    "Positions in reference: %s..%s, Contig name: %s, "
                    "Position: %s" % (header, ID, HSP, sbjt_length,
                                      sbjct_start, sbjct_end, contig_name,
                                      positions_contig)
                    )

            # Save the text and sequences for the txt alignment report
            txt_file_seq_text[db].append((text, ref_seq, homo_align[db][hit],
                                         hit_seq))

   tab.close()
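
   # Write the alignment report (<name>_hit_alignments.txt)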
   txt_file = open(out_name, "w")

   for db in pos_result:
      # Txt file alignments
      txt_file.write("##################### %s #####################\n" % (db))

      for text in txt_file_seq_text[db]:
         txt_file.write("%s\n\n" % (text[0]))

         for i in range(0, len(text[1]), 60):
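            # 60-column block: reference sequence, match line, query sequence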
            txt_file.write("%s\n" % (text[1][i:i + 60]))
            txt_file.write("%s\n" % (text[2][i:i + 60]))
            txt_file.write("%s\n\n" % (text[3][i:i + 60]))

         txt_file.write("\n")

   txt_file.close()