File: TransDecoder.Predict

package info (click to toggle)
transdecoder 3.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 11,356 kB
  • ctags: 274
  • sloc: perl: 5,454; sh: 81; makefile: 51
file content (452 lines) | stat: -rwxr-xr-x 13,114 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
#!/usr/bin/env perl

=pod

=head1 NAME

L<Transdecoder> - Transcriptome Protein Prediction

=head1 USAGE

Required:

 -t <string>                            transcripts.fasta

Common options:
 
 --retain_long_orfs <int>               retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence 
                                         marks it as coding (default: 900 bp => 300aa)

 --retain_pfam_hits <string>            domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)     
                                        Any ORF with a pfam domain hit will be retained in the final output.
 
 --retain_blastp_hits <string>          blastp output in '-outfmt 6' format.
                                        Any ORF with a blast match will be retained in the final output.

 --single_best_orf                      Retain only the single best ORF per transcript.
                                        (Best is defined as having (optionally pfam and/or blast support) and longest orf)

 --cpu <int>                            Use multiple cores for cd-hit-est. (default=1)

 -G <string>                            genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia, ...)


Advanced options

 --train <string>                       FASTA file with ORFs to train Markov Mod for protein identification; otherwise 
                                        longest non-redundant ORFs used

 -T <int>                               If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
                                        Note, 10x this value are first selected for use with cd-hit to remove redundancies,
                                        and then this -T value of longest ORFs are selected from the non-redundant set.

=cut

=head1 Genetic Codes

See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently supported:

 universal (default)
 Euplotes
 Tetrahymena
 Candida
 Acetabularia
 Mitochondrial-Canonical
 Mitochondrial-Vertebrates
 Mitochondrial-Arthropods
 Mitochondrial-Echinoderms
 Mitochondrial-Molluscs
 Mitochondrial-Ascidians
 Mitochondrial-Nematodes
 Mitochondrial-Platyhelminths
 Mitochondrial-Yeasts
 Mitochondrial-Euascomycetes
 Mitochondrial-Protozoans


=cut
    



use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;

use lib ("$FindBin::RealBin/PerlLib");

use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;

my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";


my ($cd_hit_est_exec) = &check_program('/usr/lib/cd-hit/cd-hit-est');

my ($transcripts_file,$train_file);

my $top_ORFs_train = 500;


my $help;

my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);

my $RETAIN_LONG_ORFS = 900;


my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
my $single_best_orf_flag = 0;
my $genetic_code = "";

&GetOptions( 't=s' => \$transcripts_file,
             'train:s' => \$train_file,

             'h' => \$help,
             'v' => \$verbose,
             
             'T=i' => \$top_ORFs_train,

             'search_pfam=s' => \$search_pfam,
             'reuse' => \$reuse,

             'retain_long_orfs=i' => \$RETAIN_LONG_ORFS,

             'debug' => \$MPI_DEBUG,
             
             'retain_pfam_hits=s' => \$retain_pfam_hits_file,
             'retain_blastp_hits=s' => \$retain_blastp_hits_file,
             'cpu=i' => \$cpu,

             'single_best_orf' => \$single_best_orf_flag,

             'G=s' => \$genetic_code,

             
             );



pod2usage(-verbose => 2, -output => \*STDERR) if ($help);

if (@ARGV) {
    die "Error, don't understand options: @ARGV";
}


our $SEE = $verbose;

pod2usage(-verbose => 2, -output => \*STDERR, -message => "No transcript file (-t)\n") unless ($transcripts_file && -s $transcripts_file);


if ($genetic_code) {
    $genetic_code = " --genetic_code $genetic_code";
}

main: {
    my $workdir = basename($transcripts_file) . ".transdecoder_dir"; 
    
    unless (-d $workdir) {
        die "Error, cannot find directory: $workdir,  be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
    }
        
    my $prefix = "$workdir/longest_orfs";
    my $cds_file = "$prefix.cds";
    my $gff3_file = "$prefix.gff3";
    my $pep_file = "$prefix.pep";

    ## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.

    my $top_cds_file;
    if ($train_file) {

        if (! -s $train_file) {
            die "Error, cannot locate train file: $train_file";
        }
        $top_cds_file = $train_file;
    }
    else {
        $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
        my $checkpoint = "$top_cds_file.ok";
        if (! -e $checkpoint) {
                    
            # to speed things up only check for redundancy up to x the number of entries we want
            my $red_num = $top_ORFs_train * 10;
            my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
            &process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -T $cpu -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
        
            &process_cmd("touch $checkpoint");

        }
    }
    
    
    # get hexamer scores
    my $hexamer_scores_file = "$workdir/hexamer.scores";
    my $hexamer_checkpoint = "$hexamer_scores_file.ok";
    if (! -e $hexamer_checkpoint) {
        
        my $base_freqs_file = "$workdir/base_freqs.dat";

        my $cmd = "$UTIL_DIR/seq_n_baseprobs_to_logliklihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file";
        &process_cmd($cmd);

        &process_cmd("touch $hexamer_checkpoint");
    }

    # score all cds entries
    my $cds_scores_file = "$cds_file.scores";
    my $cds_scores_checkpoint = "$cds_scores_file.ok";
    if (! -e $cds_scores_checkpoint) {
        my $cmd = "$UTIL_DIR/score_CDS_liklihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file";
        &process_cmd($cmd);
        
        &process_cmd("touch $cds_scores_checkpoint");
    }

    ## Retain those that have pfam matches

    my %has_pfam_hit;
    
    if ($retain_pfam_hits_file) {
        %has_pfam_hit = &parse_pfam_hits($retain_pfam_hits_file);
    }

    my %has_blastp_hit;
    if ($retain_blastp_hits_file) {
        %has_blastp_hit = &parse_blastp_hits_file($retain_blastp_hits_file);
    }
    
    # get accs for best entries
    my $acc_file = "$cds_file.scores.selected";
    {
        
        my %att_counter;
        
        open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
        open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
        while (<$ifh>) {
            chomp;
            my ($acc, $orf_length, @scores) = split(/\t/);

            my @ATTS;
            
            my $score_1 = shift @scores;
            my $max_score_other_frame = max(@scores);

            my $keep_flag = 0;

            if ($has_pfam_hit{$acc}) {
                $keep_flag = 1;
                push (@ATTS, "PFAM");
                print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
            }
            if ($has_blastp_hit{$acc}) {
                $keep_flag = 1;
                push (@ATTS, "BLASTP");
                print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
            }
            if ($orf_length >= $RETAIN_LONG_ORFS) {
                $keep_flag = 1;
                push (@ATTS, "LONGORF");
            }
            if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
                $keep_flag = 1;
                push (@ATTS, "FRAMESCORE");
            }
            
            if ($keep_flag) {
                print $ofh "$acc\n";

                my $att_string = join("|", sort @ATTS);
                $att_counter{$att_string}++;
            }
        }
        close $ifh;
        close $ofh;
    

        # report on the categories of the selected ORFs
        print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
        foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
            my $count = $att_counter{$att};
            print STDERR join("\t", $att, $count) . "\n";
        }
        print STDERR "########################\n\n\n";
    }
    
    # index the current gff file:
    my $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
    &process_cmd($cmd);
    
    # retrieve the best entries:
    $cmd = "$UTIL_DIR/gene_list_to_gff.pl $acc_file $gff3_file.inx > $cds_file.best_candidates.gff3";
    &process_cmd($cmd);
    
    
    ##############################
    ## Generate the final outputs.
    ##############################

    my $final_output_prefix = basename($transcripts_file) . ".transdecoder";
    
    {
                        
        # exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
        $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";

        my $target_final_file = "$cds_file.eclipsed_removed.gff3";
        
        &process_cmd($cmd);

        
        if ($single_best_orf_flag) {
            $cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
            if ($retain_blastp_hits_file) {
                $cmd .= " --blast_hits $retain_blastp_hits_file ";
            }
            if ($retain_pfam_hits_file) {
                $cmd .= " --pfam_hits $retain_pfam_hits_file ";
            }
            $cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
            
            &process_cmd($cmd);

            $target_final_file = "$cds_file.single_best_orf.gff3";
        }

        
        &process_cmd("cp $target_final_file $final_output_prefix.gff3");
                
        ## write final outputs:
        
        ## make a BED file for viewing in IGV
        my $gff3_file = "$final_output_prefix.gff3";
        my $bed_file = $gff3_file;
        $bed_file =~ s/\.gff3$/\.bed/;
        $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
        &process_cmd($cmd);
        
    
        # make a peptide file:
        my $best_pep_file = $gff3_file;
        $best_pep_file =~ s/\.gff3$/\.pep/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file $genetic_code > $best_pep_file";
        &process_cmd($cmd);
        
        

        # make a CDS file:
        my $best_cds_file = $best_pep_file;
        $best_cds_file =~ s/\.pep$/\.cds/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file --seqType CDS $genetic_code > $best_cds_file";
        &process_cmd($cmd);
        
    }
    
    print STDERR "transdecoder is finished.  See output files $final_output_prefix.\*\n\n\n";
    
    
    
    exit(0);
}


####
sub process_cmd {
	my ($cmd) = @_;

	print "CMD: $cmd\n";
	my $ret = system($cmd);

	if ($ret) {
		die "Error, cmd: $cmd died with ret $ret";
	}
	
	return;

}



####
sub parse_pfam_hits {
    my ($pfam_hits_file) = @_;
    
    my %has_pfam_hit;
    
    if (! -e $pfam_hits_file) {
        die "Error, cannot find pfam hits file: $pfam_hits_file";
    }
    
    print "PFAM output found and processing...\n";
    # capture those proteins having pfam hits
    open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file";
    while (my $ln=<$fh>) {
        next if $ln=~/^\#/;
        my @x = split(/\s+/,$ln);
        next unless $x[3];  # domtbl
        my $orf_acc = $x[3];
        $has_pfam_hit{$orf_acc} = 1;
    }
    close $fh;
    
    
    return(%has_pfam_hit);
}

####
sub parse_blastp_hits_file {
    my ($blastp_file) = @_;

    unless (-e $blastp_file) {
        die "Error, cannot find file $blastp_file";
    }

    my %blastp_hits;

    open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file";
    while (<$fh>) {
        chomp;
        my @x = split(/\t/);
        my $id = $x[0];

        $blastp_hits{$id} = 1;
    }
    close $fh;

    return(%blastp_hits);
}




sub check_program() {
    my @paths;
    foreach my $prog (@_) {
        my $path = `which $prog`;
        unless ($path =~ /\w/) {
            die "Error, path to a required program ($prog) cannot be found\n\n"
        }
        chomp($path);
        $path = readlink($path) if -l $path;
        push( @paths, $path );
    }
    return @paths;
}