1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
|
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
# Command-line handling.
#   $ARGV[0] - sample expression matrix file (required)
#   $ARGV[1] - directory holding the *.DE_results files (required)
#   $ARGV[2] - maximum FDR for a feature to be reported (optional, default 0.05)
my $usage = "usage: $0 sample_avg_expr.matrix edgeR_directory/ [FDR=0.05]\n\n";

my $sample_expr_matrix = $ARGV[0] or die $usage;
my $edgeR_dir = $ARGV[1] or die $usage;

# Test definedness (not truth) so that an explicit FDR of 0 is honoured.
my $MAX_FDR = defined $ARGV[2] ? $ARGV[2] : 0.05;

# List the directory with readdir rather than glob(): glob splits its pattern
# on whitespace and interprets metacharacters, so a directory name containing
# spaces or brackets would silently yield no files.
my @DE_result_files;
if (opendir(my $dh, $edgeR_dir)) {
    @DE_result_files =
        map  { "$edgeR_dir/$_" }
        sort { lc($a) cmp lc($b) or $a cmp $b }      # deterministic processing order
        grep { /\A[^.].*\.DE_results\z/s }           # like '*.DE_results': hidden files excluded
        readdir($dh);
    closedir $dh;
}

unless (@DE_result_files) {
    die "Error, cannot find \*.DE_results files at $edgeR_dir ";
}
# Main driver: for every pairwise DE result file, report each feature whose FDR
# is at most $MAX_FDR, together with the log2(expr+1) values of the two samples
# named in the file name (looked up in the expression matrix) and their
# difference.
#
# Output (STDOUT, tab-delimited, one line per reported feature):
#   feature, sample_A, sample_B, log2(A+1), log2(B+1), log2(A+1)-log2(B+1), FDR
#
# Dies if a file name cannot be parsed, a file cannot be opened, or a reported
# feature has no expression value for either sample.
main: {

    my %gene_to_sample_expr_val = parse_expression_matrix($sample_expr_matrix);

    foreach my $DE_result_file (@DE_result_files) {

        # The two sample names are taken from the file name:
        #   <prefix>.<sampleA>_vs_<sampleB>.edgeR.DE_results
        $DE_result_file =~ /\.([^\.\/]+)_vs_([^\.\/]+)\.edgeR\.DE_result/
            or die "Error, cannot parse filename: $DE_result_file";

        my $sample_A = $1;
        my $sample_B = $2;

        open(my $fh, '<', $DE_result_file)
            or die "Error, cannot open file $DE_result_file: $!";

        my $header = <$fh>;    # first line is read and discarded

        while (my $line = <$fh>) {
            chomp $line;
            my @x = split(/\t/, $line);
            my $feature = $x[0];
            my $FDR = $x[4];    # FDR is taken from the fifth tab-delimited column

            # Blank or truncated rows and non-numeric FDR values (e.g. "NA")
            # carry no usable FDR. Without this guard the numeric comparison
            # below would coerce them to 0 and treat the row as significant.
            next unless (defined $feature && defined $FDR && $FDR =~ /\d/);

            next unless ($FDR <= $MAX_FDR);

            my $expr_sample_A = $gene_to_sample_expr_val{$feature}->{$sample_A};
            my $expr_sample_B = $gene_to_sample_expr_val{$feature}->{$sample_B};

            unless (defined $expr_sample_A && defined $expr_sample_B) {
                # Substitute a placeholder so the message itself does not
                # trigger uninitialized-value warnings.
                my ($shown_A, $shown_B) = map { defined $_ ? $_ : 'undef' }
                                              ($expr_sample_A, $expr_sample_B);
                die "Error, no expr value for feature: $feature, $sample_A [$shown_A] or $sample_B [$shown_B] "
                    . Dumper($gene_to_sample_expr_val{$feature});
            }

            # log2 of (value + 1); the +1 keeps zero expression finite.
            my $log_expr_sample_A = log($expr_sample_A + 1) / log(2);
            my $log_expr_sample_B = log($expr_sample_B + 1) / log(2);

            my $log_FC = sprintf("%.2f", $log_expr_sample_A - $log_expr_sample_B);

            print join("\t", $feature, $sample_A, $sample_B,
                       $log_expr_sample_A, $log_expr_sample_B,
                       $log_FC, $FDR) . "\n";
        }

        close $fh;
    }

    exit(0);
}
####
# Read a tab-delimited expression matrix into a nested hash.
#
# The first line is the header holding the sample names; leading whitespace
# (such as the tab of an empty corner cell) is stripped before splitting.
# Every following line is a feature name followed by one value per sample.
#
# Parameters:
#   $expr_matrix_file - path to the matrix file
# Returns:
#   a hash (flattened into a list): feature name => { sample name => value }
# Dies:
#   if the file cannot be opened, or if the number of values on a row differs
#   from the number of sample names in the header.
# Progress messages are written to STDERR.
sub parse_expression_matrix {
    my ($expr_matrix_file) = @_;

    print STDERR "\nReading matrix: $expr_matrix_file ... ";

    # Count lines in Perl instead of shelling out to `wc -l ... | cut ...`:
    # the shell pipeline broke on file names with spaces or metacharacters,
    # and returned an empty string where wc pads its output with spaces.
    my $num_lines = _count_matrix_lines($expr_matrix_file);
    print STDERR " $num_lines rows of matrix detected.\n\n";

    my %gene_to_sample_expr_val;

    open(my $fh, '<', $expr_matrix_file)
        or die "Error, cannot open file $expr_matrix_file: $!";

    my $header = <$fh>;
    $header = '' unless defined $header;    # empty file: no samples, no rows
    chomp $header;
    $header =~ s/^\s+//;
    my @sample_names = split(/\t/, $header);

    my $counter = 0;
    while (my $line = <$fh>) {
        chomp $line;
        my @x = split(/\t/, $line);
        my $feature_name = shift @x;

        unless (scalar @x == scalar @sample_names) {
            die "Error, number of samples: " . scalar (@sample_names) . " doesn't match number of values read: " . scalar(@x) . " ";
        }

        for my $i (0 .. $#sample_names) {
            $gene_to_sample_expr_val{$feature_name}->{ $sample_names[$i] } = $x[$i];
        }

        $counter++;
        # Progress report every 10000 rows; the extra check avoids a division
        # by zero should the line count ever be zero.
        if ($counter % 10000 == 0 && $num_lines > 0) {
            my $pct_done = sprintf("%.2f", $counter/$num_lines * 100);
            print STDERR "\r[$pct_done %] matrix read. ";
        }
    }
    close $fh;

    print STDERR "\n\nDone reading matrix.\n";

    return(%gene_to_sample_expr_val);
}

# Return the number of lines in $file (header included); dies if unreadable.
sub _count_matrix_lines {
    my ($file) = @_;

    open(my $fh, '<', $file)
        or die "Error, cannot open file $file: $!";
    my $count = 0;
    $count++ while <$fh>;
    close $fh;

    return $count;
}
|