File: blaster.py

package info (click to toggle)
python-cgecore 1.5.6%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 220 kB
  • sloc: python: 2,112; makefile: 6
file content (639 lines) | stat: -rwxr-xr-x 28,089 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
#!/usr/bin/env python3
from __future__ import division
import sys
import os
import time
import random
import re
import subprocess

from Bio.Blast import NCBIXML
from Bio import SeqIO

import collections


class Blaster():
    def __init__(self, inputfile, databases, db_path, out_path='', min_cov=0.6,
                 threshold=0.9, blast='blastn', cut_off=True,
                 max_target_seqs=50000, reuse_results=False,
                 allowed_overlap=0):

        min_cov = 100 * float(min_cov)
        threshold = 100 * float(threshold)

        # For alignment data storage
        self.gene_align_query = dict()  # Sequence alignment lines
        self.gene_align_homo = dict()  # Sequence alignment homolog string
        self.gene_align_sbjct = dict()  # Sequence alignment allele string
        self.results = dict()  # Results

        # TODO: Add excluded results to this dictionay
        self.results["excluded"] = dict()

        for db in databases:
            # Adding the path to the database and output
            db_file = "%s/%s.fsa" % (db_path, db)
            tmp_out_path = "%s/tmp" % (out_path)
            out_file = "%s/out_%s.xml" % (tmp_out_path, db)
            os.makedirs(tmp_out_path, exist_ok=True)
            os.chmod(tmp_out_path, 0o775)

            # Running blast
            if (os.path.isfile(out_file)
                    and os.access(out_file, os.R_OK)
                    and reuse_results):
                print("Found " + out_file + " skipping DB.")
                out, err = (b'', b'')
            else:
                cmd = ("%s -subject %s -query %s -out %s -outfmt 5"
                       " -perc_identity  %s -max_target_seqs %s"
                       " -dust no" % (blast, db_file, inputfile,
                                        out_file, threshold, max_target_seqs))

                process = subprocess.Popen(cmd, shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
                out, err = process.communicate()

            # Get results file
            try:
                 # Test if output file exist
                 result_handle = open(out_file, "r")
            except IOError:
                 sys.exit(("Error: BLAST did not run as expected. "
                           "The expected output file, {}, did not exist.\n"
                           "BLAST finished with the following response:"
                           "\n{}\n{}").format(os.path.abspath(out_file),
                                              out.decode("utf-8"),
                                              err.decode("utf-8")))

            # Test if blast output is empty
            if os.stat(out_file).st_size == 0:
                 sys.exit(("Error: BLAST did not run as expected. "
                           "The output file {} was empty.\n"
                           "BLAST finished with the following response:"
                           "\n{}\n{}").format(os.path.abspath(out_file),
                                              out.decode("utf-8"),
                                              err.decode("utf-8")))

            # Get blast output
            blast_records = NCBIXML.parse(result_handle)

            # Declaring variables for saving the results

            # results for each gene
            gene_results = dict()

            # For finding the best hits
            best_hsp = dict()

            # Keeping track of gene split
            gene_split = collections.defaultdict(dict)

            # Making the dicts for sequence outputs
            self.gene_align_query[db] = dict()
            self.gene_align_homo[db] = dict()
            self.gene_align_sbjct[db] = dict()

            # Parsing over the hits and only keeping the best
            for blast_record in blast_records:
                # OLD CODE TO BE REMOVED
                #  query = blast_record.query
                #  blast_record.alignments.sort(key=lambda align: (
                #     -max((len(hsp.query)
                #            * (int(hsp.identities) / float(len(hsp.query)))
                #            for hsp in align.hsps)))
                #  )

                # Sort BLAST alignments by the hsp in each alignment with the
                # highest number of identical nucleotides (hsp.identities)
                blast_record.alignments.sort(
                    key=lambda align: (max((int(hsp.identities)
                                           for hsp in align.hsps))),
                    reverse=True)
                query = blast_record.query

                for alignment in blast_record.alignments:
                    # Setting the e-value as 1 and bit as 0 to get the best
                    # HSP fragment
                    best_e_value = 1
                    best_bit = 0
                    start_hsp = 0
                    end_hsp = 0

                    for hsp in alignment.hsps:
                        if hsp.expect < best_e_value or hsp.bits > best_bit:
                            # best_e_value = hsp.expect
                            # best_bit = hsp.bits
                            tmp = alignment.title.split(" ")
                            sbjct_header = tmp[1]
                            # DEBUG
                            print("Found: {}".format(sbjct_header))
                            bit = hsp.bits
                            sbjct_length = alignment.length
                            sbjct_start = hsp.sbjct_start
                            sbjct_end = hsp.sbjct_end
                            gaps = hsp.gaps
                            query_string = str(hsp.query)
                            homo_string = str(hsp.match)
                            sbjct_string = str(hsp.sbjct)
                            contig_name = query.replace(">", "")
                            query_start = hsp.query_start
                            query_end = hsp.query_end
                            HSP_length = len(query_string)
                            perc_ident = (int(hsp.identities)
                                          / float(HSP_length) * 100)
                            strand = 0
                            coverage = ((int(HSP_length) - int(gaps))
                                        / float(sbjct_length))
                            perc_coverage = (((int(HSP_length) - int(gaps))
                                             / float(sbjct_length)) * 100)

                            # cal_score is later used to select the best hit
                            cal_score = perc_ident * coverage

                            hit_id = "%s:%s..%s:%s:%f" % (
                                contig_name, query_start, query_end,
                                sbjct_header, cal_score)
                            # If the hit is on the other strand
                            if sbjct_start > sbjct_end:
                                tmp = sbjct_start
                                sbjct_start = sbjct_end
                                sbjct_end = tmp

                                query_string = self.reversecomplement(
                                    query_string)
                                homo_string = homo_string[::-1]
                                sbjct_string = self.reversecomplement(
                                    sbjct_string)
                                strand = 1

                            # Save hit
                            if((cut_off and perc_coverage > 20)
                               or cut_off is False):

                                best_hsp = {'evalue': hsp.expect,
                                            'sbjct_header': sbjct_header,
                                            'bit': bit,
                                            'perc_ident': perc_ident,
                                            'sbjct_length': sbjct_length,
                                            'sbjct_start': sbjct_start,
                                            'sbjct_end': sbjct_end,
                                            'gaps': gaps,
                                            'query_string': query_string,
                                            'homo_string': homo_string,
                                            'sbjct_string': sbjct_string,
                                            'contig_name': contig_name,
                                            'query_start': query_start,
                                            'query_end': query_end,
                                            'HSP_length': HSP_length,
                                            'coverage': coverage,
                                            'cal_score': cal_score,
                                            'hit_id': hit_id,
                                            'strand': strand,
                                            'perc_coverage': perc_coverage
                                            }
                            else:
                                continue



                        # Saving the result if any
                        if best_hsp:
                            save = 1

                            # If there are other gene alignments they are compared
                            if gene_results:
                                tmp_gene_split = gene_split
                                tmp_results = gene_results
                                # Compare the hit results
                                save, gene_split, gene_results = (
                                    self.compare_results(save, best_hsp,
                                                         tmp_results,
                                                         tmp_gene_split,
                                                         allowed_overlap)
                                )

                            # If the hit is not overlapping with other hit
                            # seqeunces it is kept
                            if save == 1:
                                # DEBUG
                                print("Saving: {}".format(hit_id))
                                gene_results[hit_id] = best_hsp

            result_handle.close()

            # If the hit does not cover the entire database reference the
            # missing seqence data are extracted
            keys = list(gene_results.keys())

            for hit_id in keys:
                hit = gene_results[hit_id]

                # Calculate possible split gene coverage
                perc_coverage = hit['perc_coverage']

                if(hit['sbjct_header'] in gene_split
                    and len(gene_split[hit['sbjct_header']]) > 1):

                    # Calculate new length
                    new_length = self.calculate_new_length(gene_split, gene_results,
                                                                        hit)
                    hit['split_length'] = new_length

                    # Calculate new coverage
                    perc_coverage = new_length / float(hit['sbjct_length']) * 100

                # If the hit is above the minimum length threshold it is kept
                if perc_coverage >= min_cov:
                    if hit['coverage'] == 1:
                        self.gene_align_query[db][hit_id] = hit['query_string']
                        self.gene_align_homo[db][hit_id] = hit['homo_string']
                        self.gene_align_sbjct[db][hit_id] = hit['sbjct_string']
                    elif hit['coverage'] != 1:
                        # Getting the whole database sequence
                        for seq_record in SeqIO.parse(db_file, "fasta"):
                            if seq_record.description.replace(" ", "") == hit['sbjct_header'].replace(" ", ""):
                                start_seq = str(seq_record.seq)[:int(hit["sbjct_start"])-1]
                                end_seq = str(seq_record.seq)[int(hit["sbjct_end"]):]
                                self.gene_align_sbjct[db][hit_id] = start_seq + hit['sbjct_string'] + end_seq
                                #self.gene_align_sbjct[db][hit_id] = str(seq_record.seq)
                                break

                        # Getting the whole contig to extract extra query seqeunce
                        contig = ''
                        for seq_record in SeqIO.parse(inputfile, "fasta"):
                            if seq_record.description.replace(" ", "") == hit['contig_name'].replace(" ", ""):
                                contig = str(seq_record.seq)
                                break

                        # Extract extra sequence from query
                        query_seq, homo_seq = self.get_query_align(hit, contig)

                        # Saving the new alignment sequences
                        self.gene_align_query[db][hit_id] = query_seq
                        self.gene_align_homo[db][hit_id] = homo_seq

                else:
                    del gene_results[hit_id]
                    if hit['sbjct_header'] in gene_split:
                        del gene_split[hit['sbjct_header']]

            # Save the database result
            if gene_results:
                self.results[db] = gene_results
            else:
                self.results[db] = "No hit found"

    @staticmethod
    def reversecomplement(seq):
        # Make reverse complement strand
        trans = str.maketrans("ATGC", "TACG")
        return seq.translate(trans)[::-1]

    @staticmethod
    def compare_results(save, best_hsp, tmp_results, tmp_gene_split,
                        allowed_overlap):
        """
            Function for comparing hits and saving only the best hit
        """

        # Get data for comparison
        hit_id = best_hsp['hit_id']
        new_start_query = best_hsp['query_start']
        new_end_query = best_hsp['query_end']
        new_start_sbjct = int(best_hsp['sbjct_start'])
        new_end_sbjct = int(best_hsp['sbjct_end'])
        new_score = best_hsp['cal_score']
        new_db_hit = best_hsp['sbjct_header']
        new_contig = best_hsp['contig_name']
        new_HSP = best_hsp['HSP_length']

        # See if the best HSP fragment overlap with another allignment
        # and keep the allignment with the highest score - if the new
        # fragment is not providing new sequence
        keys = list(tmp_results.keys())
        for hit in keys:
            hit_data = tmp_results[hit]
            old_start_query = hit_data['query_start']
            old_end_query = hit_data['query_end']
            old_start_sbjct = int(hit_data['sbjct_start'])
            old_end_sbjct = int(hit_data['sbjct_end'])
            old_score = hit_data['cal_score']
            old_db_hit = hit_data['sbjct_header']
            old_contig = hit_data['contig_name']
            old_HSP = hit_data['HSP_length']

            remove_old = 0
            # If they align to the same gene in the database they are
            # compared
            if new_db_hit == old_db_hit:

                #If the hit comes from different contig
                if old_contig != new_contig:

                    # Save a split if the new hit still creats one
                    if(new_db_hit in tmp_gene_split
                       and hit_id not in tmp_gene_split[new_db_hit]):

                        tmp_gene_split[new_db_hit][hit_id] = 1

                # If the hit provides additional sequence it is kept and
                # the new coverage is saved otherwise the one with the
                # highest score is kept
                elif(new_start_sbjct < old_start_sbjct
                   or new_end_sbjct > old_end_sbjct):

                    # Save the hits as split
                    tmp_gene_split[old_db_hit][hit_id] = 1
                    if hit not in tmp_gene_split[old_db_hit]:
                        tmp_gene_split[old_db_hit][hit] = 1


#                else:
#                    if new_score > old_score:
                        # Set to remove old hit
#                        remove_old = 1

                        # Save a split if the new hit still creats one
#                        if(new_db_hit in tmp_gene_split
#                           and hit_id not in tmp_gene_split[new_db_hit]):

#                            tmp_gene_split[new_db_hit][hit_id] = 1
#                    else:
#                        save = 0

                        # If the old and new hit is not identical the
                        # possible saved gene split for the new hit is
                        # removed
#                        if hit_id != hit:
#                            if(new_db_hit in tmp_gene_split
#                               and hit_id in tmp_gene_split[new_db_hit]):

#                                del tmp_gene_split[new_db_hit][hit_id]
#                        break

            # If the hits comes form the same part of the contig
            # sequnce but match different genes only the best hit is
            # kept
            if new_contig == old_contig:
                print("Same contig: {} == {}".format(new_contig, old_contig))
                print("\t{} vs {}".format(new_db_hit, old_db_hit))
                # Check if saved hits overlaps with current hit
                hit_union_length = (max(old_end_query, new_end_query)
                                    - min(old_start_query, new_start_query))
                hit_lengths_sum = ((old_end_query - old_start_query)
                                   + (new_end_query - new_start_query))
                overlap_len = (hit_lengths_sum - hit_union_length)

                if overlap_len < allowed_overlap:
                    print("\tignore overlap ({}): {}".format(overlap_len, new_db_hit))
                    continue
                print("\toverlap found ({}): {}".format(overlap_len, new_db_hit))

                # If the two hits cover the exact same place on the
                # contig only the percentage of identity is compared
                if(old_start_query == new_start_query
                   and old_end_query == new_end_query):

                    if best_hsp['perc_ident'] > hit_data['perc_ident']:

                        # Set to remove old hit
                        remove_old = 1

                        # Save a split if the new hit still creats one
                        if(new_db_hit in tmp_gene_split
                           and hit_id not in tmp_gene_split[new_db_hit]):

                            tmp_gene_split[new_db_hit][hit_id] = 1

                    elif best_hsp['perc_ident'] == hit_data['perc_ident']:
                        # Save both

                        # Save a split if the new hit still creats one
                        if(new_db_hit in tmp_gene_split
                           and hit_id not in tmp_gene_split[new_db_hit]):

                            tmp_gene_split[new_db_hit][hit_id] = 1
                    else:
                        save = 0

                        # Remove new gene from gene split if present
                        if(new_db_hit in tmp_gene_split
                           and hit_id in tmp_gene_split[new_db_hit]):

                            del tmp_gene_split[new_db_hit][hit_id]
                        break

                # If new hit overlaps with the saved hit
                elif(hit_union_length <= hit_lengths_sum):
                    print("\t{} <= {}".format(hit_union_length, hit_lengths_sum))
                    print("\t\tScores: {} cmp {}".format(new_score, old_score))

                    if new_score > old_score:
                        # Set to remove old gene
                        remove_old = 1
                        # Save a split if the new hit still creats one
                        if(new_db_hit in tmp_gene_split
                           and hit_id not in tmp_gene_split[new_db_hit]):

                            tmp_gene_split[new_db_hit][hit_id] = 1

                    elif new_score == old_score:
                        # If both genes are of same coverage
                        # and identity is the same implied by new_score == old_score
                        # hit is chosen based on length
                        if((int(best_hsp['perc_coverage']) ==
                             int(hit_data['perc_coverage']))
                             and new_HSP > old_HSP):

                            # Set to remove old gene
                            remove_old = 1
                        elif((int(best_hsp['perc_coverage']) ==
                                int(hit_data['perc_coverage']))
                                and old_HSP > new_HSP):

                            # Remove current hit
                            save = 0
                        elif((int(best_hsp['perc_coverage']) ==
                                int(hit_data['perc_coverage']))
                                and old_HSP==new_HSP):
                            # Both hits has same coverage, and same identity
                            # and same length, how to choose only one hit?
                            pass

                        # TODO
                        # If new_score == old_score but identity and coverages are not the same.
                        # which gene should be chosen?? Now they are both keept.

                        # Save a split if the new hit creats one - both
                        # hits are saved
                        if(new_db_hit in tmp_gene_split
                            and hit_id not in tmp_gene_split[new_db_hit]):

                            tmp_gene_split[new_db_hit][hit_id] = 1
                    else:
                        # Remove new gene from gene split if present
                        if(new_db_hit in tmp_gene_split
                            and hit_id in tmp_gene_split[new_db_hit]):

                            del tmp_gene_split[new_db_hit][hit_id]

                        save = 0
                        break

            # Remove old hit if new hit is better
            if remove_old == 1:
                del tmp_results[hit]

                # Remove gene from gene split if present
                if(old_db_hit in tmp_gene_split
                    and hit in tmp_gene_split[old_db_hit]):

                    del tmp_gene_split[old_db_hit][hit]

        return save, tmp_gene_split, tmp_results

    @staticmethod
    def calculate_new_length(gene_split, gene_results, hit):
        """
            Function for calcualting new length if the gene is split on
            several contigs
        """
        # Looping over splitted hits and calculate new length
        first = 1
        for split in gene_split[hit['sbjct_header']]:

            new_start = int(gene_results[split]['sbjct_start'])
            new_end = int(gene_results[split]['sbjct_end'])

            # Get the first HSP
            if first == 1:
                new_length = int(gene_results[split]['HSP_length'])
                old_start = new_start
                old_end = new_end
                first = 0
                continue
            if new_start < old_start:
                new_length = new_length + (old_start - new_start)
                old_start = new_start

            if new_end > old_end:
                new_length = new_length + (new_end - old_end)
                old_end = new_end

        return(new_length)

    @staticmethod
    def get_query_align(hit, contig):
        """
            Function for extracting extra seqeunce data to the query
            alignment if the full reference length are not covered
        """

        # Getting data needed to extract sequences
        query_seq = hit['query_string']
        homo_seq = hit['homo_string']
        sbjct_start = int(hit['sbjct_start'])
        sbjct_end = int(hit['sbjct_end'])
        query_start = int(hit['query_start'])
        query_end = int(hit['query_end'])
        length = int(hit['sbjct_length'])

        # If the alignment doesn't start at the first position data is
        # added to the begnning
        if sbjct_start != 1:
            missing = sbjct_start - 1

            if(query_start >= missing and hit['strand'] != 1
                or hit['strand'] == 1 and missing <= (len(contig) - query_end)):

                # Getting the query sequence.
                # If the the hit is on the other strand the characters
                # are reversed.
                if hit['strand'] == 1:
                    start_pos = query_end
                    end_pos = query_end + missing
                    chars = contig[start_pos:end_pos]
                    chars = Blaster.reversecomplement(chars)
                else:
                    start_pos = query_start - missing - 1
                    end_pos = query_start - 1
                    chars = contig[start_pos:end_pos]

                query_seq = chars + str(query_seq)
            else:
                # Getting the query sequence.
                # If the the hit is on the other strand the characters
                # are reversed.
                if hit['strand'] == 1:
                    if query_end == len(contig):
                        query_seq = "-" * missing + str(query_seq)
                    else:
                        start_pos = query_end
                        chars = contig[start_pos:]
                        chars = Blaster.reversecomplement(chars)

                        query_seq = ("-" * (missing - len(chars))
                                         + chars + str(query_seq))
                elif query_start < 3:
                    query_seq = "-" * missing + str(query_seq)
                else:
                    end_pos = query_start - 2
                    chars = contig[0:end_pos]

                    query_seq = ("-" * (missing - len(chars))
                                     + chars + str(query_seq))

            # Adding to the homo sequence
            spaces = " " * missing
            homo_seq = str(spaces) + str(homo_seq)

        # If the alignment dosen't end and the last position data is
        # added to the end
        if sbjct_end < length:
            missing = length - sbjct_end

            if(missing <= (len(contig) - query_end) and hit['strand'] != 1
                or hit['strand'] == 1 and query_start >= missing):

                # Getting the query sequence.
                # If the the hit is on the other strand the characters
                # are reversed.
                if hit['strand'] == 1:
                    start_pos = query_start - missing - 1
                    end_pos = query_start - 1
                    chars = contig[start_pos:end_pos]
                    chars = Blaster.reversecomplement(chars)
                else:
                    start_pos = query_end
                    end_pos = query_end + missing
                    chars = contig[start_pos:end_pos]

                query_seq = query_seq + chars
            else:
                # If the hit is on the other strand the characters are reversed
                if hit['strand'] == 1:
                    if query_start < 3:
                        query_seq = query_seq + "-" * missing
                    else:
                        end_pos = query_start - 2
                        chars = contig[0:end_pos]
                        chars = Blaster.reversecomplement(chars)

                        query_seq = (query_seq
                                         + chars + "-" * (missing - len(chars)))
                elif query_end == len(contig):
                    query_seq = query_seq + "-" * missing
                else:
                    start_pos = query_end
                    chars = contig[start_pos:]

                    query_seq = query_seq + chars + "-" * (missing - len(chars))

            # Adding to the homo sequence
            spaces = " " * int(missing)
            homo_seq = str(homo_seq) + str(spaces)

        return query_seq, homo_seq