File: summ_domain_ident.pl

package info (click to toggle)
fasta3 36.3.8i.14-Nov-2020-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 7,016 kB
  • sloc: ansic: 77,269; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40
file content (96 lines) | stat: -rwxr-xr-x 3,042 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/perl

################################################################
# copyright (c) 2014 by William R. Pearson and The Rector &
# Visitors of the University of Virginia */
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License. 
################################################################

# summ_domain_ident.pl takes a -m 8CC result file with query-annotated
# domains, and produces a tab-delimited summary of identity across the domains
# parse:
# sp|P09488|GSTM1_HUMAN	gi|121735|sp|P09488.3|GSTM1_HUMAN	100.00	218	0	0	1	218	1	218	2.9e-113	408.2	218M	|RX:1-12:1-12:s=64;b=25.0;I=1.000;Q=47.5;C=exon_1|RX:13-37:13-37:s=128;b=49.9;I=1.000;Q=121.4;C=exon_2|RX:38-59:38-59:s=125;b=48.7;I=1.000;Q=117.9;C=exon_3|RX:60-86:60-86:s=145;b=56.5;I=1.000;Q=141.0;C=exon_4|RX:87-120:87-120:s=185;b=72.1;I=1.000;Q=187.2;C=exon_5|RX:121-152:121-152:s=174;b=67.8;I=1.000;Q=174.5;C=exon_6|RX:153-189:153-189:s=197;b=76.8;I=1.000;Q=201.0;C=exon_7|RX:190-218:190-218:s=151;b=58.9;I=1.000;Q=147.9;C=exon_8

use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;

my ($shelp, $help) = (0, 0);

GetOptions(
    "h|?" => \$shelp,
    "help" => \$help,
    );

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;
pod2usage(1) unless @ARGV;

my $first_line =1;

my @a_field_names = qw( score_field bits id qval comment );
my @domain_names = ();

while (my $line = <>) {
  next if $line =~ m/^#/;
  chomp($line);
  
  # get last (annotation) field

  my @fields = split(/\t/,$line);
  $fields[1] =~ s/^gi\|\d+\|//;
  $fields[1] =~ s/\.\d+\|/\|/;

  my @annots = split(/\|/,$fields[-1]);
  shift @annots; # first is blank

  # $annots[...]:
  # RX:1-12:1-12:s=64;b=25.0;I=1.000;Q=47.5;C=exon_1

  my %dom_ids = ();

  for my $annot (@annots) {
    my %a_fields = ();
    @a_fields{@a_field_names} = split(/;/,$annot);
    $a_fields{'id'} =~ s/^I=//;
    $a_fields{'comment'} =~ s/^C=//;

    $dom_ids{$a_fields{'comment'}} = $a_fields{'id'};

    if ($first_line) {
      push @domain_names, $a_fields{'comment'};
    }
  }
  if ($first_line) {
    print join("\t",("subj_acc          ","ident",@domain_names)),"\n";
    $first_line = 0;
  }

  for my $dom ( @domain_names ) {
    if (defined($dom_ids{$dom})) {
      if (100.0 * $dom_ids{$dom} >= $fields[2]) {
	$dom_ids{$dom} .= '+';
      }
      else {
	$dom_ids{$dom} .= '-';
      }
    }
    else {
      $dom_ids{$dom} = '';
    }
  }

  print join("\t",($fields[1],$fields[2],@dom_ids{@domain_names})),"\n";
}