1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
|
#!/usr/bin/env perl
use Getopt::Long;
use Pod::Usage;
use File::Basename;
use FindBin;
use lib $FindBin::RealBin;
use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
use Env qw(@PATH);
@PATH = ($FindBin::RealBin, "$FindBin::RealBin/sam", @PATH);
use strict;
use warnings;
#const
my $status = 0;
my $bowtie_path = "";
my $nThreads = 1;
my $quiet = 0;
my $help = 0;
my $keep_intermediate_files = 0;
my $version = 0;
my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ('') x 7;
my $chipseq_target_read_files = '';
my $chipseq_control_read_files = '';
my $chipseq_peak_file = '';
my $partition_model = 'pk';
my $chipseq_read_files_multi_targets = ''; ## read files for multiple targets
## delimited by comma
my $chipseq_bed_files_multi_targets = ''; ## BED files for multiple targets
## delimited by comma
GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
"temporary-folder=s" => \$temp_dir,
"bowtie-path=s" => \$bowtie_path,
"p|num-threads=i" => \$nThreads,
'chipseq-target-read-files=s' => \$chipseq_target_read_files,
## delimited by comma if more than one
'chipseq-control-read-files=s' => \$chipseq_control_read_files,
## delimited by comma if more than one
'chipseq-read-files-multi-targets=s' => \$chipseq_read_files_multi_targets,
## delimited by comma
'chipseq-bed-files-multi-targets=s' => \$chipseq_bed_files_multi_targets,
## delimited by comma
'chipseq-peak-file=s' => \$chipseq_peak_file,
'partition-model=s' => \$partition_model,
"version" => \$version,
"q|quiet" => \$quiet,
"h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
pod2usage(-verbose => 2) if ($help == 1);
&showVersionInfo($FindBin::RealBin) if ($version == 1);
#check parameters and options
{
my $msg = '';
if ( ( $chipseq_peak_file eq '' ) &&
( ( $chipseq_target_read_files eq '' ) ||
( $chipseq_control_read_files eq '' ) ||
( $bowtie_path eq '' ) ) &&
( ( $chipseq_read_files_multi_targets eq '' ) ||
( $bowtie_path eq '' ) ) &&
( $chipseq_bed_files_multi_targets eq '' )
) {
$msg = "please define one set of the following options to run pRSEM's testing procedure:\n" .
"1. --chipseq-peak-file <file>\n" .
"2. --chipseq-target-read-files <file> and\n" .
" --chipseq-control-read-files <file> and\n" .
" --bowtie-path <path>\n" .
"3. --chipseq-read-files-multi-targets <files> and\n" .
" --bowtie-path <path>\n" .
"4. --chipseq-bed-files-multi-targets <files>\n";
}
my @prsem_partition_models = ( 'pk', 'cmb_lgt' );
my %prtmdl2one = ();
foreach my $prtmdl (@prsem_partition_models) {
$prtmdl2one{$prtmdl} = 1;
}
if ( exists $prtmdl2one{$partition_model} ) {
if ( ( $partition_model eq 'cmb_lgt' ) &&
( ( $chipseq_read_files_multi_targets eq '' ) &&
( $chipseq_bed_files_multi_targets eq '' ) ) ){
$msg = 'either --chipseq-read-files-multi-targets <files> or ' .
'--chipseq-bed-files-multi-targets <files> needs to be ' .
"defined for pRSEM's partition model: '$partition_model'";
} elsif ( ( $partition_model ne 'pk' ) &&
( $partition_model ne 'cmb_lgt' ) &&
( ( $chipseq_target_read_files eq '' ) ||
( $chipseq_control_read_files eq '' ) ||
( $bowtie_path eq '' ) ) ){
$msg = '--chipseq-target-read-files <file> and ' .
'--chipseq-control-read-files <file> and ' .
'--bowtie-path <path> need to be defined for ' .
"pRSEM's partition model: '$partition_model'";
}
} else {
my $s_prt_mdls = join(', ', @prsem_partition_models);
$msg = "\n--partition-model <string> must be one of [$s_prt_mdls]\n" .
"pRSEM's testing procedure only supports the above partition models";
}
if ( $msg ne '' ) {
pod2usage(-msg => "$msg\n", -exitval => 2, -verbose => 2);
}
if ( ( $partition_model ne 'cmb_lgt' ) &&
( ( $chipseq_read_files_multi_targets ne '' ) ||
( $chipseq_bed_files_multi_targets ne '' ) ) ) {
print "\nCombining signals from multiple sources, partition model is set to 'cmb_lgt'\n\n";
$partition_model = 'cmb_lgt';
}
}
$refName = $ARGV[0];
$sampleName = $ARGV[1];
my $pos = rindex($sampleName, '/');
if ($pos < 0) { $sampleToken = $sampleName; }
else { $sampleToken = substr($sampleName, $pos + 1); }
if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
$stat_dir = "$sampleName.stat";
if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
$imdName = "$temp_dir/$sampleToken";
$statName = "$stat_dir/$sampleToken";
if ($bowtie_path ne "") { $bowtie_path .= "/"; }
my $command = "";
{
$command = "$FindBin::RealBin/pRSEM/prsem-testing-procedure " .
" --num-threads $nThreads " .
" --partition-model $partition_model ";
## ChIP-seq peak file from single source
if ( $chipseq_peak_file ne '') { ## only for partition model pk
## need to add sanity check!!
$command .= " --chipseq-peak-file $chipseq_peak_file";
} elsif ( $partition_model eq 'cmb_lgt' ) { ## multi-sources
if ( $chipseq_bed_files_multi_targets ne '' ) { ## use bed over read
$command .= ' --chipseq-bed-files-multi-targets ' .
$chipseq_bed_files_multi_targets;
} elsif ( $chipseq_read_files_multi_targets ne '' ) {
$command .= ' --chipseq-read-files-multi-targets ' .
$chipseq_read_files_multi_targets .
" --bowtie-path $bowtie_path" ;
}
} else { ## ChIP-seq reads files from single source
$command .= " --chipseq-target-read-files $chipseq_target_read_files " .
" --bowtie-path $bowtie_path" ;
if ( $chipseq_control_read_files ne '' ) {
$command .= " --chipseq-control-read-files $chipseq_control_read_files";
}
}
if ( $quiet ) {
$command .= ' --quiet ';
}
$command .= " $refName $sampleName $statName $imdName";
&runCommand($command);
}
if (!$keep_intermediate_files) {
&runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!");
}
__END__
=head1 NAME
rsem-run-prsem-testing-procedure
=head1 SYNOPSIS
rsem-run-prsem-testing-procedure [options] reference_name sample_name
=head1 ARGUMENGS
=over
=item B<reference_name>
The name of the reference used. Users must run 'rsem-prepare-reference' with this reference_name and with '--prep-pRSEM' specified before running this program.
=item B<sample_name>
The name of the sample analyzed. Users must run 'rsem-calculate-expression' with this sample_name and with RSEM's default Gibbs sampling performed before running this program.
=back
=head1 BASIC OPTIONS
=over
=item B<--bowtie-path> <string>
The path to the Bowtie executables. (Default: the path to the Bowtie executables is assumed to be in the user's PATH environment variable)
=item B<--chipseq-target-read-files> <string>
Comma-separated full path of FASTQ read file(s) for ChIP-seq target. This option provides information to calculate ChIP-seq peaks and signals. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-control-read-files <string>' must be defined when this option is specified. (Default: "")
=item B<--chipseq-control-read-files> <string>
Comma-separated full path of FASTQ read file(s) for ChIP-seq conrol. This option provides information to call ChIP-seq peaks. The file(s) can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. The options '--bowtie-path <path>' and '--chipseq-target-read-files <string>' must be defined when this option is specified. (Default: "")
=item B<--chipseq-read-files-multi-targets> <string>
Comma-separated full path of FASTQ read files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information to calculate ChIP-seq signals. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--bowtie-path <path>' must be defined and the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
=item B<--chipseq-bed-files-multi-targets> <string>
Comma-separated full path of BED files for multiple ChIP-seq targets. This option is used when prior is learned from multiple complementary data sets. It provides information of ChIP-seq signals and must have at least the first six BED columns. All files can be either ungzipped or gzipped with a suffix '.gz' or '.gzip'. When this option is specified, the option '--partition-model <string>' will be set to 'cmb_lgt' automatically. (Default: "")
=item B<--chipseq-peak-file> <string>
Full path to a ChIP-seq peak file in ENCODE's narrowPeak, i.e. BED6+4 format. This file is used in prior-enhanced RSEM's default two-partition model. It partitions isoforms by whether they have ChIP-seq overlapping with their transcription start site region or not. Each partition will have its own prior parameter learned from a training set. This file can be either gzipped or ungzipped. (Default: "")
=item B<--partition-model> <string>
A keyword to specify the partition model. It must be either 'pk' or 'cmb_lgt'. For details, please see the help document of 'rsem-calculate-expression'.
=item B<-p|--num-threads> <int>
Number of threads to use. (Default: 1)
=item B<--version>
Show version information.
=item B<-q|--quiet>
Suppress the output of logging information. (Default: off)
=item B<-h|--help>
Show help information.
=back
=head1 ADVANCED OPTIONS
=over
=item B<--keep-intermediate-files>
Keep temporary files generated by RSEM and this testing procedure. RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. By default, after this test is finished, the temporary directory is deleted. Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
=item B<--temporary-folder> <string>
Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
=back
=head1 DESCRIPTION
This program provides users a p-value and a log-likelihood to determine whether external data set(s) is informative and how informative it is for RNA-seq quantification. It is used in conjunction with prior-enhanced RSEM to let user select the most effective external data set(s).
Users can run this program repetitively with different external data. All p-values and log-likelihoods will be saved in an output file 'sample_name.pval_LL'.
=head1 NOTES
Users must run 'rsem-prepare-reference' with the appropriate referece and with the option '--prep-pRSEM' before using this program
Users must run 'rsem-calculate-expression' with the option '--calc-pme' before using this program
The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
=head1 OUTPUT
=over
=item B<sample_name.pval_LL>
This file contains partition model's name, basename(s) of external data set file(s), p-value, and log-likelihood delimited by tab. When this program is ran repetiively, output will be concatenated to the end of this file without removing previous results.
=back
The following output files are the same as the ones generated by 'rsem-calculate-expression' with prior-enhanced RSEM. Please refer to the help document of 'rsem-calculate-expression' for details
=over 2
=item B<sample_name.stat/sample_name.all_tr_features>
=item B<sample_name.stat/sample_name.all_tr_prior>
=item B<sample_name.stat/sample_name.lgt_mdl.RData>
=item B<sample_name.stat/sample_name.pval_LL>
=item B<sample_name.stat/sample_name_uniform_prior_1.isoforms.results>
=item B<sample_name.stat/sample_name_uniform_prior_1.genes.results>
=back
=head1 EXAMPLE
Assuming RSEM reference files are under '/ref' with name 'mouse_125' and expression files are under '/expr' with name 'mouse_125'. Suppose we want to derive prior from four histone modification ChIP-seq read data sets: '/data/H3K27Ac.fastq.gz', '/data/H3K4me1.fastq.gz', '/data/H3K4me2.fastq.gz', and '/data/H3K4me3.fastq.gz'. Also, assuming Bowtie's executables are under '/sw/bowtie/' and we want to use 16 cores:
rsem-run-prsem-testing-procedure --partition-model cmb_lgt \
--chipseq-read-files-multi-targets /data/H3K27Ac.fastq.gz,/data/H3K4me1.fastq.gz,/data/H3K4me2.fastq.gz,/data/H3K4me3.fastq.gz \
--bowtie-path /sw/bowtie \
--num-threads 16 \
/ref/mouse_125 \
/expr/mouse_125
=cut
|