File: TransDecoder.Predict

package info (click to toggle)
transdecoder 3.0.1%2Bdfsg-1
links: PTS, VCS
area: main
in suites: stretch
size: 11,356 kB
ctags: 274
sloc: perl: 5,454; sh: 81; makefile: 51
file content (452 lines) | stat: -rwxr-xr-x 13,114 bytes
#!/usr/bin/env perl

=pod

=head1 NAME

L<Transdecoder> - Transcriptome Protein Prediction

=head1 USAGE

Required:

 -t <string>                            transcripts.fasta

Common options:
 
 --retain_long_orfs <int>               retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence 
                                         marks it as coding (default: 900 bp => 300aa)

 --retain_pfam_hits <string>            domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)     
                                        Any ORF with a pfam domain hit will be retained in the final output.
 
 --retain_blastp_hits <string>          blastp output in '-outfmt 6' format.
                                        Any ORF with a blast match will be retained in the final output.

 --single_best_orf                      Retain only the single best ORF per transcript.
                                        (Best is defined as having (optionally pfam and/or blast support) and longest orf)

 --cpu <int>                            Use multiple cores for cd-hit-est. (default=1)

 -G <string>                            genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia, ...)


Advanced options

 --train <string>                       FASTA file with ORFs to train Markov Mod for protein identification; otherwise 
                                        longest non-redundant ORFs used

 -T <int>                               If no --train, top longest ORFs to train Markov Model (hexamer stats) (default: 500)
                                        Note, 10x this value are first selected for use with cd-hit to remove redundancies,
                                        and then this -T value of longest ORFs are selected from the non-redundant set.

=cut

=head1 Genetic Codes

See L<http://golgi.harvard.edu/biolinks/gencode.html>. These are currently supported:

 universal (default)
 Euplotes
 Tetrahymena
 Candida
 Acetabularia
 Mitochondrial-Canonical
 Mitochondrial-Vertebrates
 Mitochondrial-Arthropods
 Mitochondrial-Echinoderms
 Mitochondrial-Molluscs
 Mitochondrial-Ascidians
 Mitochondrial-Nematodes
 Mitochondrial-Platyhelminths
 Mitochondrial-Yeasts
 Mitochondrial-Euascomycetes
 Mitochondrial-Protozoans


=cut
    



use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;

use lib ("$FindBin::RealBin/PerlLib");

use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;

my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";


my ($cd_hit_est_exec) = &check_program('/usr/lib/cd-hit/cd-hit-est');

my ($transcripts_file,$train_file);

my $top_ORFs_train = 500;


my $help;

my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);

my $RETAIN_LONG_ORFS = 900;


my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
my $single_best_orf_flag = 0;
my $genetic_code = "";

&GetOptions( 't=s' => \$transcripts_file,
             'train:s' => \$train_file,

             'h' => \$help,
             'v' => \$verbose,
             
             'T=i' => \$top_ORFs_train,

             'search_pfam=s' => \$search_pfam,
             'reuse' => \$reuse,

             'retain_long_orfs=i' => \$RETAIN_LONG_ORFS,

             'debug' => \$MPI_DEBUG,
             
             'retain_pfam_hits=s' => \$retain_pfam_hits_file,
             'retain_blastp_hits=s' => \$retain_blastp_hits_file,
             'cpu=i' => \$cpu,

             'single_best_orf' => \$single_best_orf_flag,

             'G=s' => \$genetic_code,

             
             );



pod2usage(-verbose => 2, -output => \*STDERR) if ($help);

if (@ARGV) {
    die "Error, don't understand options: @ARGV";
}


our $SEE = $verbose;

pod2usage(-verbose => 2, -output => \*STDERR, -message => "No transcript file (-t)\n") unless ($transcripts_file && -s $transcripts_file);


if ($genetic_code) {
    $genetic_code = " --genetic_code $genetic_code";
}

main: {
    my $workdir = basename($transcripts_file) . ".transdecoder_dir"; 
    
    unless (-d $workdir) {
        die "Error, cannot find directory: $workdir,  be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
    }
        
    my $prefix = "$workdir/longest_orfs";
    my $cds_file = "$prefix.cds";
    my $gff3_file = "$prefix.gff3";
    my $pep_file = "$prefix.pep";

    ## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.

    my $top_cds_file;
    if ($train_file) {

        if (! -s $train_file) {
            die "Error, cannot locate train file: $train_file";
        }
        $top_cds_file = $train_file;
    }
    else {
        $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
        my $checkpoint = "$top_cds_file.ok";
        if (! -e $checkpoint) {
                    
            # to speed things up only check for redundancy up to x the number of entries we want
            my $red_num = $top_ORFs_train * 10;
            my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num > $red_num_cds_longest_file");
            &process_cmd("$cd_hit_est_exec -r 1 -i $red_num_cds_longest_file -T $cpu -c 0.80 -o $red_num_cds_longest_file.nr80 -M 0 ");
            &process_cmd("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr80 $top_ORFs_train > $top_cds_file");
        
            &process_cmd("touch $checkpoint");

        }
    }
    
    
    # get hexamer scores
    my $hexamer_scores_file = "$workdir/hexamer.scores";
    my $hexamer_checkpoint = "$hexamer_scores_file.ok";
    if (! -e $hexamer_checkpoint) {
        
        my $base_freqs_file = "$workdir/base_freqs.dat";

        my $cmd = "$UTIL_DIR/seq_n_baseprobs_to_logliklihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file";
        &process_cmd($cmd);

        &process_cmd("touch $hexamer_checkpoint");
    }

    # score all cds entries
    my $cds_scores_file = "$cds_file.scores";
    my $cds_scores_checkpoint = "$cds_scores_file.ok";
    if (! -e $cds_scores_checkpoint) {
        my $cmd = "$UTIL_DIR/score_CDS_liklihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file";
        &process_cmd($cmd);
        
        &process_cmd("touch $cds_scores_checkpoint");
    }

    ## Retain those that have pfam matches

    my %has_pfam_hit;
    
    if ($retain_pfam_hits_file) {
        %has_pfam_hit = &parse_pfam_hits($retain_pfam_hits_file);
    }

    my %has_blastp_hit;
    if ($retain_blastp_hits_file) {
        %has_blastp_hit = &parse_blastp_hits_file($retain_blastp_hits_file);
    }
    
    # get accs for best entries
    my $acc_file = "$cds_file.scores.selected";
    {
        
        my %att_counter;
        
        open (my $ofh, ">$acc_file") or die "Error, cannot write to $acc_file";
        open (my $ifh, "$cds_file.scores") or die "Error, cannot open file $cds_file.scores";
        while (<$ifh>) {
            chomp;
            my ($acc, $orf_length, @scores) = split(/\t/);

            my @ATTS;
            
            my $score_1 = shift @scores;
            my $max_score_other_frame = max(@scores);

            my $keep_flag = 0;

            if ($has_pfam_hit{$acc}) {
                $keep_flag = 1;
                push (@ATTS, "PFAM");
                print STDERR "-$acc flagged as having a pfam domain.\n" if $verbose;
            }
            if ($has_blastp_hit{$acc}) {
                $keep_flag = 1;
                push (@ATTS, "BLASTP");
                print STDERR "-$acc flagged as having a blastp match.\n" if $verbose;
            }
            if ($orf_length >= $RETAIN_LONG_ORFS) {
                $keep_flag = 1;
                push (@ATTS, "LONGORF");
            }
            if ($score_1 > 0 && $score_1 > $max_score_other_frame) {
                $keep_flag = 1;
                push (@ATTS, "FRAMESCORE");
            }
            
            if ($keep_flag) {
                print $ofh "$acc\n";

                my $att_string = join("|", sort @ATTS);
                $att_counter{$att_string}++;
            }
        }
        close $ifh;
        close $ofh;
    

        # report on the categories of the selected ORFs
        print STDERR "\n#####################\nCounts of kept entries according to attributes:\n";
        foreach my $att (reverse sort {$att_counter{$a}<=>$att_counter{$b}} keys %att_counter) {
            my $count = $att_counter{$att};
            print STDERR join("\t", $att, $count) . "\n";
        }
        print STDERR "########################\n\n\n";
    }
    
    # index the current gff file:
    my $cmd = "$UTIL_DIR/index_gff3_files_by_isoform.pl $gff3_file";
    &process_cmd($cmd);
    
    # retrieve the best entries:
    $cmd = "$UTIL_DIR/gene_list_to_gff.pl $acc_file $gff3_file.inx > $cds_file.best_candidates.gff3";
    &process_cmd($cmd);
    
    
    ##############################
    ## Generate the final outputs.
    ##############################

    my $final_output_prefix = basename($transcripts_file) . ".transdecoder";
    
    {
                        
        # exclude shadow orfs (smaller orfs in different reading frame that are eclipsed by longer orfs)
        $cmd = "$UTIL_DIR/remove_eclipsed_ORFs.pl $cds_file.best_candidates.gff3 > $cds_file.eclipsed_removed.gff3";

        my $target_final_file = "$cds_file.eclipsed_removed.gff3";
        
        &process_cmd($cmd);

        
        if ($single_best_orf_flag) {
            $cmd = "$UTIL_DIR/single_best_ORF_per_transcript.pl ";
            if ($retain_blastp_hits_file) {
                $cmd .= " --blast_hits $retain_blastp_hits_file ";
            }
            if ($retain_pfam_hits_file) {
                $cmd .= " --pfam_hits $retain_pfam_hits_file ";
            }
            $cmd .= " --gff3_file $cds_file.eclipsed_removed.gff3 > $cds_file.single_best_orf.gff3";
            
            &process_cmd($cmd);

            $target_final_file = "$cds_file.single_best_orf.gff3";
        }

        
        &process_cmd("cp $target_final_file $final_output_prefix.gff3");
                
        ## write final outputs:
        
        ## make a BED file for viewing in IGV
        my $gff3_file = "$final_output_prefix.gff3";
        my $bed_file = $gff3_file;
        $bed_file =~ s/\.gff3$/\.bed/;
        $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
        &process_cmd($cmd);
        
    
        # make a peptide file:
        my $best_pep_file = $gff3_file;
        $best_pep_file =~ s/\.gff3$/\.pep/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file $genetic_code > $best_pep_file";
        &process_cmd($cmd);
        
        

        # make a CDS file:
        my $best_cds_file = $best_pep_file;
        $best_cds_file =~ s/\.pep$/\.cds/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file --seqType CDS $genetic_code > $best_cds_file";
        &process_cmd($cmd);
        
    }
    
    print STDERR "transdecoder is finished.  See output files $final_output_prefix.\*\n\n\n";
    
    
    
    exit(0);
}


####
sub process_cmd {
	my ($cmd) = @_;

	print "CMD: $cmd\n";
	my $ret = system($cmd);

	if ($ret) {
		die "Error, cmd: $cmd died with ret $ret";
	}
	
	return;

}



####
sub parse_pfam_hits {
    my ($pfam_hits_file) = @_;
    
    my %has_pfam_hit;
    
    if (! -e $pfam_hits_file) {
        die "Error, cannot find pfam hits file: $pfam_hits_file";
    }
    
    print "PFAM output found and processing...\n";
    # capture those proteins having pfam hits
    open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file";
    while (my $ln=<$fh>) {
        next if $ln=~/^\#/;
        my @x = split(/\s+/,$ln);
        next unless $x[3];  # domtbl
        my $orf_acc = $x[3];
        $has_pfam_hit{$orf_acc} = 1;
    }
    close $fh;
    
    
    return(%has_pfam_hit);
}

####
sub parse_blastp_hits_file {
    my ($blastp_file) = @_;

    unless (-e $blastp_file) {
        die "Error, cannot find file $blastp_file";
    }

    my %blastp_hits;

    open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file";
    while (<$fh>) {
        chomp;
        my @x = split(/\t/);
        my $id = $x[0];

        $blastp_hits{$id} = 1;
    }
    close $fh;

    return(%blastp_hits);
}




sub check_program() {
    my @paths;
    foreach my $prog (@_) {
        my $path = `which $prog`;
        unless ($path =~ /\w/) {
            die "Error, path to a required program ($prog) cannot be found\n\n"
        }
        chomp($path);
        $path = readlink($path) if -l $path;
        push( @paths, $path );
    }
    return @paths;
}