File: gmap_gff3_to_percent_length_stats.pl

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (75 lines) | stat: -rwxr-xr-x 1,856 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env perl

use strict;
use warnings;

use lib ("/usr/lib/trinityrnaseq/PerlLib");
use Fasta_reader;

my $usage = "usage: $0 gmap.gff3 transcripts.fasta\n\n";

my $gmap_gff3 = $ARGV[0] or die $usage;
my $trans_fa = $ARGV[1] or die $usage;


main: {

    my %trans_coords_matched;

    open (my $fh, $gmap_gff3) or die $!;
    while (<$fh>) {
        if (/^\#/) { next; }
        chomp;
        my @x = split(/\t/);
        my $info = $x[8];
        
        $info =~ /Target=(\S+) (\d+) (\d+)/ or die "Error, cannot parse transcript and coordinates from $info";

        my $acc = $1;
        my $lend = $2;
        my $rend = $3;
        my $per_id = $x[5];

        push (@{$trans_coords_matched{$acc}}, [$lend, $rend, $per_id]);
        
    }
    close $fh;


    my $fasta_reader = new Fasta_reader($trans_fa);
    my %trans_seqs = $fasta_reader->retrieve_all_seqs_hash();
    
    foreach my $trans_acc (keys %trans_seqs) {

        my $trans_length = length($trans_seqs{$trans_acc}) or die "Error, no trans seq for $trans_acc";

        unless (exists $trans_coords_matched{$trans_acc}) {
            print join("\t", $trans_acc, 0, $trans_length, 0, 0) . "\n";
            next;
        }
        
        ## compute length and percent identity
        my @coords = @{$trans_coords_matched{$trans_acc}};

        my $match_len = 0;
        my $sum_per_id_len = 0;
        foreach my $coordset (@coords) {
            my ($lend, $rend, $per_id) = @$coordset;
            my $len = $rend - $lend + 1;
            $match_len += $len;
            $sum_per_id_len += $len * $per_id;
        }
        my $avg_per_id = $sum_per_id_len / $match_len;
        

        my $per_length = sprintf("%.2f", $match_len / $trans_length * 100);

        print "$trans_acc\t$match_len\t$trans_length\t$per_length\t$avg_per_id\n";
    }


    exit(0);
}