File: create_start_PSSM.pl

package info (click to toggle)
transdecoder 3.0.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 11,356 kB
  • ctags: 274
  • sloc: perl: 5,454; sh: 81; makefile: 51
file content (131 lines) | stat: -rwxr-xr-x 3,140 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env perl

use strict;
use warnings;

use FindBin;
use lib ("$FindBin::Bin/../PerlLib");
use Gene_obj;
use Gene_obj_indexer;
use GFF3_utils;
use Carp;
use Fasta_retriever;
use Nuc_translator;

$|++;

my $usage = "\n\nusage: $0 gene_db.inx_file transcripts.fasta train_accs.list pre_length post_length\n\n";

my $inx_file = $ARGV[0] or die $usage;
my $transcripts_fasta = $ARGV[1] or die $usage;
my $train_accs_file = $ARGV[2] or die $usage;
my $pre_length = $ARGV[3] or die $usage;
my $post_length = $ARGV[4] or die $usage;


main: {

    my $gene_obj_indexer = new Gene_obj_indexer( { "use" => "$inx_file" } );

    my $fasta_retriever = new Fasta_retriever($transcripts_fasta);

    my @train_accs = `cat $train_accs_file`;
    chomp @train_accs;
    
    my @nuc_frequency;
    
    my $seq_counter = 0;

    foreach my $gene_id (@train_accs) {

        my $gene_obj = $gene_obj_indexer->get_gene($gene_id);
        
        my $com_name = $gene_obj->{com_name};

        unless ($com_name =~ /type:complete/ || $com_name =~ /type:3prime_partial/) { next; } # need start codon


        my $orient = $gene_obj->get_orientation();
        my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_model_span();
        my $contig_id = $gene_obj->{asmbl_id};
        
        my $trans_seq = uc $fasta_retriever->get_seq($contig_id);

        my $seq_length = length($trans_seq);
        
        if ($orient eq '-') {
            $trans_seq = &reverse_complement($trans_seq);
            $lend = $seq_length - $rend + 1;
        }

        my $atg = substr($trans_seq, $lend-1, 3);
        unless ($atg eq "ATG") {
            die "Error, missing ATG start: $atg ";
        }
        
        my $begin_profile = $lend - 1 - $pre_length;
        my $profile_length = $pre_length + 3 + $post_length;

        if ($begin_profile >= 1) {
            my $start_codon_context_substring = substr($trans_seq, $begin_profile, $profile_length);
            #print "$start_codon_context_substring\n";
            &add_to_profile(\@nuc_frequency, $start_codon_context_substring);
            $seq_counter++;
        }
        
        
        
    }
 

    &write_PSSM($pre_length, $seq_counter, \@nuc_frequency);
    
   
    exit(0);
}


####
sub add_to_profile {
    my ($nuc_freq_aref, $context_string) = @_;

    my @chars = split(//, $context_string);

    for (my $i = 0; $i <= $#chars; $i++) {
        my $char = $chars[$i];
        $nuc_freq_aref->[$i]->{$char}++;
    }
    
    return;
}


####
sub write_PSSM {
    my ($pre_length, $num_seqs, $nuc_freq_aref) = @_;
    
    my @nucs = qw(G A T C);
    
    my $atg_pos = $pre_length + 1;
    print "#ATG=$atg_pos\n";
    print "#" . join("\t", @nucs) . "\n";
    
    for (my $i = 0; $i < $#$nuc_freq_aref; $i++) {

        my %char_counts = %{$nuc_freq_aref->[$i]};
        
        my @rel_freqs;
        
        foreach my $nuc (@nucs) {
            my $count = $char_counts{$nuc} || 0;

            my $relative_freq = sprintf("%.3f", $count/$num_seqs);

            push (@rel_freqs, $relative_freq);
        }

        print join("\t", @rel_freqs) . "\n";
    }

    return;
}