File: trinity_trans_matrix_to_rep_trans_gene_matrix.pl

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (82 lines) | stat: -rwxr-xr-x 1,726 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env perl

use strict;
use warnings;

## sums up reads per gene, but uses the single isoform with the highest sum counts as the representative entry in the reported 'gene' matrix.

my $usage = "usage: $0 trinity_trans_matrix >  trinity_rep_trans_matrix\n\n";

my $trans_matrix_file = $ARGV[0] or die $usage;

main: {

    open(my $fh, $trans_matrix_file) or die $!;
    my $header = <$fh>;

    print $header;

    my %data;
    while (<$fh>) {
        chomp;
        my $line = $_;
        my @x = split(/\t/);
        my $acc = $x[0];

        #c1000023_g1_i3

        my $gene_id;
        if ($acc =~ /^([\w_]+_g\d+)_i\d+/) {
            $gene_id = $1;
        }
        unless ($gene_id) {
            die "Error, cannot extract gene identifier for $acc";
        }

        push (@{$data{$gene_id}}, $line);
    }
    close $fh;

    foreach my $acc (keys %data) {
        my @lines = @{$data{$acc}};
        if (scalar @lines > 0) {
            my $rep_line = &get_rep(@lines);
            print "$rep_line\n";
        }
        else {
            my $rep_line = shift @lines;
            print "$rep_line\n";
        }
    }

    exit(0);
}

###
sub get_rep {
    my @lines = @_;

    my @vals;
    my $best_rowsum = -1;
    my $best_acc = "";
    
    foreach my $line (@lines) {
        my @x = split(/\t/, $line);
        my $acc = shift @x;
        my $sum = 0;
        for (my $i = 0; $i <= $#x; $i++) {
            my $val = $x[$i];
            $sum += $val;
            $vals[$i] += $val;
        }
        if ($sum > $best_rowsum) {
            $best_rowsum = $sum;
            $best_acc = $acc;
            
        }
    }
    
    my $line = join("\t", $best_acc, @vals);
    return($line);
}