File: blast_top_hits.pl

package info (click to toggle)
seer 1.1.4-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,692 kB
  • sloc: cpp: 2,945; perl: 596; python: 122; makefile: 93; sh: 43
file content (130 lines) | stat: -rwxr-xr-x 3,799 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/perl -w

use strict;
use warnings;

use Getopt::Long;
use Text::CSV;

# Features
# BLAST and BLAT
# Exact/full query matches only
#

my $clean = 0;
my $tmp_sort = "tmp_blat.psl";

my $help_message = <<HELP;
Usage: ./blast_top_hits.pl --input blast_out --mode blast <options>

Reduces a blast or blat output file to the top hit for each query sequence
only

   Options

   Required
   --input              BLAST output file in -m 6/-outfmt 6 or BLAT .psl file
   --mode               Either blast or blat

   Optional
   --full_query        The entire query must be reported as a match to appear
                       in the output

   -h, --help          Shows this help.

HELP
#****************************************************************************************#
#* Main                                                                                 *#
#****************************************************************************************#

#* gets input parameters
my ($file_in, $mode, $full_query, $help);
GetOptions( "input|f=s" => \$file_in,
            "mode=s" => \$mode,
            "full_query" => \$full_query,
            "help|h" => \$help
		   ) or die($help_message);

# Basic error check on inputs
if (defined($help))
{
   print STDERR $help_message;
}
elsif (!defined($file_in) || !-e $file_in)
{
   print STDERR "Input file not found\n";
   print STDERR $help_message;
}
elsif (!defined($mode))
{
   print STDERR "Operation mode must be either blast or blat\n";
   print STDERR $help_message;
}
else
{
   my (@columns, $qseqid, $qstart, $qend, $length);
   if ($mode =~ /blat/i)
   {
      # psl format
      # match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStarts  tStarts
      #         match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
      @columns = qw( match mismatch repmatch Ns Qgapcount Qgapbases Tgapcount Tgapbases strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend blockcount blocksizes qstarts tstarts );

      $mode = "blat";
      $qseqid = "Qname";
      $qstart = "Qstart";
      $qend = "Qend";
      $length = "match";

      # Ensure sorted by query, then match score
      system("sort -k10,10 -k1,1nr $file_in > $tmp_sort");

      $file_in = $tmp_sort;
      $clean = 1;
   }
   elsif ($mode =~ /blast/i)
   {
      # blast outfmt 6
      #qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
      @columns = qw( qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore );

      $mode = "blast";
      $qseqid = "qseqid";
      $qstart = "qstart";
      $qend = "qend";
      $length = "length";
   }
   else
   {
      die("Operation mode $mode not supported. Use either blast or blat\n");
   }

   my $csv = Text::CSV->new ( { binary => 1, sep_char => "\t", auto_diag => 1, eol => $/ } )  # should set binary attribute.
                 or die "Cannot use CSV: ".Text::CSV->error_diag ();
   my %rec;
   $csv->bind_columns (\@rec{@columns});

   open my $blast_in, "<:encoding(utf8)", "$file_in" or die "$file_in: $!";

   my $previous_query = "";
   while ($csv->getline($blast_in))
   {
      # Using speed advice from http://www.perlmonks.org/?node_id=937023
      if ($rec{$qseqid} ne $previous_query)
      {
         if (!$full_query || ((($mode eq "blast" && $rec{$qstart} == 1) || $rec{$qstart} == 0) && $rec{$qend} == $rec{$length}))
         {
            $csv->print(*STDOUT, [ @rec{@columns} ]);
         }
         $previous_query = $rec{$qseqid};
      }
   }

   if ($clean)
   {
      unlink $file_in;
   }
}

exit(0);