File: count_N50_given_MIN_FPKM_threshold.pl

package info (click to toggle)
trinityrnaseq 2.11.0%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 417,528 kB
  • sloc: perl: 48,420; cpp: 17,749; java: 12,695; python: 3,124; sh: 1,030; ansic: 983; makefile: 688; xml: 62
file content (84 lines) | stat: -rwxr-xr-x 1,723 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env perl

use strict;
use warnings;


my $usage = "\n\nusage: $0 RSEM.genes.results\n\n";

my $rsem_file = $ARGV[0] or die $usage;

main: {

    my @entries;

    open (my $fh, $rsem_file) or die $!;
    my $header = <$fh>;
    my $max_fpkm = 0;
    while (<$fh>) {
        chomp;
        my @x = split(/\t/);
        my $len = $x[2];
        my $fpkm = $x[6];
        
        push (@entries, { len => $len,
                          fpkm => $fpkm,
                      });
    
        
        if ($fpkm > $max_fpkm) {
            $max_fpkm = $fpkm;
        }
    }
    close $fh;

    print join("\t", "min_fpkm", "cum_seq_len", "partial_sum_len", "N50_len", "num_entries") . "\n";
    ;
    &N50(0, \@entries);
    
    my $min_fpkm = 0.01;
    
    while ($min_fpkm < $max_fpkm) {
        
        @entries = grep { $_->{fpkm} >= $min_fpkm } @entries;
        
        &N50($min_fpkm, \@entries);
        
        $min_fpkm *= 2;
    }

}

sub N50 {
    my ($min_fpkm, $entries_aref) = @_;

    my @entries = @$entries_aref;
    @entries = reverse sort {$a->{len}<=>$b->{len}} @entries;
    
    my $num_entries = scalar(@entries);

    my $cum_seq_len = 0;
    foreach my $entry (@entries) {
        $cum_seq_len += $entry->{len};
    }
    
    my $half_cum_len = $cum_seq_len / 2;
    
    my $n50_len = "NA";
    my $partial_sum_len = 0;
    foreach my $entry (@entries) {
        my $len = $entry->{len};
        $partial_sum_len += $len;
        
        if ($partial_sum_len >= $half_cum_len) {
            $n50_len = $len;
            last;
        }
    }
    
    
    print join("\t", $min_fpkm, int($cum_seq_len+0.5), int($partial_sum_len+0.5), $n50_len, $num_entries) . "\n";

    return;
}