File: remove_batch_effects_from_count_matrix.pl

package info (click to toggle)
trinityrnaseq 2.15.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 468,004 kB
  • sloc: perl: 49,905; cpp: 17,993; java: 12,489; python: 3,282; sh: 1,989; ansic: 985; makefile: 717; xml: 62
file content (141 lines) | stat: -rwxr-xr-x 3,350 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use Getopt::Long qw(:config no_ignore_case bundling);
use Cwd;
use File::Basename;
use lib ("/usr/lib/trinityrnaseq/PerlLib");
use Fasta_reader;
use Data::Dumper;


my $usage = <<__EOUSAGE__;


#################################################################################################
#
#  Required:
#
#  --matrix|m <string>               matrix of raw read counts (not normalized!)
#
#  --batches_file|b <string>         tab-delimited text file indicating biological replicate relationships.
#                                   ex.
#                                        batch_1    cond_A_rep1
#                                        batch_1    cond_B_rep1
#
#                                        batch_2    cond_A_rep2
#                                        batch_2    cond_B_rep2
#
#
################################################################################################



__EOUSAGE__


    ;


my $matrix_file;
my $batches_file;
my $help_flag;

&GetOptions ( 'h' => \$help_flag,
              'matrix|m=s' => \$matrix_file,              
              'batches_file|b=s' => \$batches_file,
              
    );



if ($help_flag) {
    die $usage;
}


unless ($matrix_file 
        && $batches_file
    ) { 
    
    die $usage;
    
}

main: {
            
    &run_edgeR_remove_batch_effect($matrix_file, $batches_file);
    
    
    exit(0);
}


####
sub run_edgeR_remove_batch_effect {
    my ($matrix_file, $batches_file) = @_;

    use File::Basename;
    my $base = basename($batches_file);
    my $Rscript_name = "$base.batch_eff_removal.Rscript";
    my $out_matrix_name = basename($matrix_file) . ".batch_eff_removal.matrix";

    ## write R-script to run edgeR
    open (my $ofh, ">$Rscript_name") or die "Error, cannot write to $Rscript_name";
    
    print $ofh "library(edgeR)\n";

    print $ofh "\n";
    
    print $ofh "rnaseqMatrix = read.table(\"$matrix_file\", header=T, row.names=1, com='', check.names=F)\n";
    print $ofh "batches = read.table(\"$batches_file\", header=F, row.names=2, check.names=F)\n";
    print $ofh "batch_factors = as.factor(batches[colnames(rnaseqMatrix),])\n";
    
    print $ofh "exp_study = DGEList(counts=rnaseqMatrix)\n";
    print $ofh "exp_study = calcNormFactors(exp_study)\n";
    print $ofh "logCPM <- cpm(exp_study,log=TRUE)\n";
    print $ofh "logCPM <- removeBatchEffect(logCPM,batch=batch_factors)\n";
    
    print $ofh "millions = colSums(rnaseqMatrix)/1e6\n";
    
    print $ofh "myCPM = 2^logCPM\n";
    print $ofh "myRecounts = apply(myCPM, 1, function (x) x*millions)\n";
    print $ofh "write.table(t(myRecounts), file=\'$out_matrix_name\', quote=F, sep='\t')\n";
        
    
    close $ofh;

    ## Run R-script
    my $cmd = "R --no-save --no-restore --no-site-file --no-init-file -q < $Rscript_name";


    eval {
        &process_cmd($cmd);
    };
    if ($@) {
        print STDERR "$@\n\n";
        print STDERR "\n\nWARNING: This EdgeR comparison failed...\n\n";
        ## if this is due to data paucity, such as in small batch data sets, then ignore for now.
    }
    

    return;
}



####
sub process_cmd {
    my ($cmd) = @_;

    print "CMD: $cmd\n";
    my $ret = system($cmd);

    if ($ret) {
        die "Error, cmd: $cmd died with ret ($ret) ";
    }

    return;
}