File: EM.pm

package info (click to toggle)
trinityrnaseq 2.15.2%2Bdfsg-1
links: PTS, VCS
area: main
in suites: trixie
size: 468,004 kB
sloc: perl: 49,905; cpp: 17,993; java: 12,489; python: 3,282; sh: 1,989; ansic: 985; makefile: 717; xml: 62
file content (516 lines) | stat: -rwxr-xr-x 11,394 bytes
parent folder | download | duplicates (5)
package EM;

use strict;
use warnings;
use SAM_entry;
use Carp;

use vars qw($TOKEN);
BEGIN {
	$TOKEN = "$;";
}

my $MIN_EFF_TRANS_LENGTH = 10; # // large enough to keep abundance estimates from going absurdly high for short transcripts.

####
sub  new {
	my $packagename = shift;
	my ($transcript_seqs_href, $fragment_length) = @_;
	
    unless ($fragment_length) {
        $fragment_length = 1; # negligible with respect to the length of the transcript
    }


	# transcript_seqs_href:  ( seq_acc => sequence, ...)
	# trans_reads_aref: ( [trans_accA, read_accA], [trans_accB, read_accB], ...)
	
	unless ($transcript_seqs_href) {
		confess "Error, need param: transcript_seqs_href";
	}
	
	

	my $self = {
		
		# mapping info
		_trans_to_multi_map_counts => {}, # trans => { transA\$;transB\$;transC... } => count_of_reads

		# model params
		_ENt => {}, # trans => expected read count
		_theta => {}, # trans => fraction of all reads
		
		# transcript info
		_trans_lengths => {}, # trans => length
        _frag_length => $fragment_length,
        
				
		# misc
		_total_reads => 0, # total read count

	};
	

	bless ($self, $packagename);

	$self->_init_trans_lengths($transcript_seqs_href);
	
	return($self);

}


####
sub _init_trans_lengths {
	my $self = shift;
	my ($transcript_seqs_href) = @_;
	
	foreach my $transcript (keys %$transcript_seqs_href) {
		
		my $sequence = $transcript_seqs_href->{$transcript} or confess "Error, no sequence for transcript: $transcript";
		
		$self->{_trans_lengths}->{$transcript} = length($sequence);
		
	}

	return;
}

sub run {
	my $self = shift;
	my (%settings) = @_;

	my $max_iterations = $settings{max_iterations} || 1000;
	my $min_delta_ML = $settings{min_delta_ML} || 0.01;
    my $verbose_flag = $settings{verbose} || 0;

	
	$self->_init_theta();
	
	#########################
	## Do the EM:
	
	my $prev_ML_val = undef;
	
	my $round = 0;
	my $delta = 1;
	
	print STDERR "\tEM started\n";
	while (1) {
	
		$round++;
		
		$self->_E_step();

		$self->_M_step();

		
		my $ml = $self->_compute_Max_Likelihood();
		
		if (defined $prev_ML_val) {
			$delta = $ml - $prev_ML_val;
		}
		
		print STDERR "\rML[$round]: $ml       $delta       ";
		
		$prev_ML_val = $ml;
		
        if ($verbose_flag) {
            print "\n\nEM_results, round: $round:\n";
            $self->report_results();
            print "\n";
        }
        
		if ($round >= $max_iterations || $delta  < $min_delta_ML) {
			
			last;
		}

	}
	print STDERR "\n\tEM finished.\n";
	
	return;
}


####
sub _init_theta {
	my $self = shift;

	###########################
	## init theta:
	##   assume mutliply mapped reads equally divided among their mapped locations for init
	
	my @transcripts = $self->_get_all_transcripts();

	my $total_reads = $self->{_total_reads};
	
	foreach my $transcript (@transcripts) {
		
		my $read_count_sum = 0;
		
		my @combos = $self->_get_multi_map_list_including_transcript($transcript);
		
		foreach my $combo (@combos) {
			
			my @other_trans = $self->_get_transcripts_in_combo($combo);
			
			my $num_other = scalar(@other_trans);
			my $count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);
			
			$read_count_sum += ($count/$num_other);
		}
				
		$self->{_theta}->{$transcript} = $read_count_sum / $total_reads;

	}

	return;
}

####
sub _get_all_transcripts {
	my $self = shift;
	
	my @transcripts = keys %{$self->{_trans_to_multi_map_counts}};
	
	return(@transcripts);
}


####
sub _get_reads_mapped_to_transcript {
	my $self = shift;
	my ($transcript) = @_;

	if (exists $self->{_trans_to_reads_href}->{$transcript}) {
		my @reads = @{$self->{_trans_to_reads_href}->{$transcript}};
		return(@reads);
	}
	else {
		confess "Error, no reads mapped to transcript $transcript";
	}
}

####
sub _get_transcript_length {
	my $self = shift;
	my ($transcript) = @_;

	return($self->{_trans_lengths}->{$transcript});
}




####
sub _E_step {
	my $self = shift;

	my @transcripts = $self->_get_all_transcripts();
	
	## do the E
	foreach my $transcript (@transcripts) {
		
		my $expected_read_count = 0;
		
		
		
		my $trans_length = $self->_get_transcript_length($transcript) or die "Error, no length for transcript: $transcript";
        my $eff_trans_length = $trans_length - $self->{_frag_length} + 1;
        if ($eff_trans_length < $MIN_EFF_TRANS_LENGTH) {
            $eff_trans_length = $MIN_EFF_TRANS_LENGTH;
        }
        

		my $theta_t = $self->{_theta}->{$transcript};
		if (! defined $theta_t) {
			die "Error, no theta($transcript), $theta_t";
		}
		
		
		my @combos = $self->_get_multi_map_list_including_transcript($transcript);
		
		#print "Combos: @combos\n";
		

	    foreach my $combo (@combos) {
						
			my $combo_count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);
			
			#print "Got combo: $combo, count: $combo_count\n";
						
			my @other_transcripts = split(/$TOKEN/, $combo);
			if (scalar @other_transcripts == 1) {
				# only current transcript:
				unless ($other_transcripts[0] eq $transcript) {
					confess "Error, mapped single transcript should be the current transcript";
				}
				$expected_read_count += $combo_count;
			}
			else {
				## compute partial mapping.
				
							
				my $numerator = (1 / $eff_trans_length) * $theta_t;
                #my $numerator = $theta_t;
                
			
				# demonator: sum for all t: P(r|t) * theta(t)
				my $denominator = 0;
			
				
				#print STDERR "Other transcripts: @other_transcripts\n";
				
				foreach my $other_transcript (@other_transcripts) {  # other transcripts includes the current transcript as well.
				
					my $other_trans_len = $self->_get_transcript_length($other_transcript);
					unless ($other_trans_len) {
						die "Error, no trans length for $other_transcript";
					}
					
                    my $eff_other_trans_length = $other_trans_len - $self->{_frag_length} + 1;
                    if ($eff_other_trans_length < $MIN_EFF_TRANS_LENGTH) {
                        $eff_other_trans_length = $MIN_EFF_TRANS_LENGTH;
                    }
                    
					my $other_theta = $self->{_theta}->{$other_transcript};
					
					if (! defined $other_theta) {
						die "no theta for $other_transcript, $other_theta";
					}
					
					my $val = (1 / $eff_other_trans_length) * $other_theta;
                    #my $val = $other_theta;
                    
					$denominator += $val;
				}
			
				#print "Fractional read count for $transcript = $numerator / $denominator\n";
			
				my $fractional_read_count = $combo_count * ($numerator/$denominator);
				
				
				$expected_read_count += $fractional_read_count;
			}
		}
		
		$self->{_ENt}->{$transcript} = $expected_read_count;
	}
	
	return;
}
						
			
####
sub _M_step {
	my $self = shift;
	
	my $sum_NT = 0;

	foreach my $E (values %{$self->{_ENt}}) {
		$sum_NT += $E;
	}
	
	## theta = ENt(t) / sum( for all t: ENt(t) )

	
	foreach my $transcript ($self->_get_all_transcripts()) {
		my $new_theta = $self->{_ENt}->{$transcript} / $sum_NT;
		
		$self->{_theta}->{$transcript} = $new_theta;
	}

	return;
}


####
sub _compute_Max_Likelihood {
	my $self = shift;
	
	# ML = sum:t (ENt * log(theta(t)))

	my $ML = 0;

	foreach my $transcript ($self->_get_all_transcripts()) {
		
		my $theta = $self->{_theta}->{$transcript};
		
		if ($theta > 0) {
			
			my $ENt = $self->{_ENt}->{$transcript};

			$ML += $ENt * log($theta);
			
		}
	}

	return($ML);
}


####
sub get_results {
	my $self = shift;
	
	my $total_reads = $self->{_total_reads};

    my @results;
	
	foreach my $transcript (sort $self->_get_all_transcripts()) {
		
		my $num_frags = $self->{_ENt}->{$transcript};
		my $len = $self->_get_transcript_length($transcript);
		
		$num_frags = sprintf("%.2f", $num_frags);
		
		my $fpkm = $num_frags / ($len/1e3) / ($total_reads/1e6);
		$fpkm = sprintf("%.2f", $fpkm);

		my ($num_unique_reads, $num_multi_map_reads) = $self->count_reads_mapped_to_transcript($transcript);
		
        my $struct = { trans_id => $transcript,
                       length => $len,
                       unique_map => $num_unique_reads,
                       multi_map => $num_multi_map_reads,
                       expected_map => $num_frags,
                       FPKM => $fpkm,
        };

        push (@results, $struct);


		
	}
	

	return (@results);
}


sub print_results {
    my $self = shift;
    
    print $self->report_results();
    return;
}



sub report_results {
    my ($self) = shift;

    my @results = $self->get_results();
        
    my $out_text = join("\t", "#transcript", "trans_length", "unique_map", "multi_map", "EM_frag_count", "FPKM") . "\n";
    
    foreach my $result (@results) {
            
        $out_text .= join("\t", 
                          $result->{trans_id},
                          $result->{length},
                          $result->{unique_map},
                          $result->{multi_map},
                          $result->{expected_map},
                          $result->{FPKM}) . "\n";
    }
    
    $out_text .= "\n"; # last spacer
    
    return ($out_text);

}






####
sub _get_multi_map_list_including_transcript {
	my $self = shift;
	my ($transcript) = @_;

	my @combos = keys %{$self->{_trans_to_multi_map_counts}->{$transcript}};
	return(@combos);
}

####
sub _get_multi_map_read_count_for_combo {
	my $self = shift;
	my ($transcript, $combo) = @_;
	
	my $val = $self->{_trans_to_multi_map_counts}->{$transcript}->{$combo};
	
	return($val);
}

####
sub _get_transcripts_in_combo {
	my $self = shift;
	my ($combo) = @_;

	my @transcripts = split(/$TOKEN/, $combo);
	return(@transcripts);
}


####
sub _create_combo {
	my $self = shift;
	my @transcripts = @_;

	my $combo = join($TOKEN, sort @transcripts);
	
	return($combo);
}

####
sub add_read { # really add_fragment: count pairs only once.
	my $self = shift;
	my @transcripts = @_;

	my $combo = $self->_create_combo(@transcripts);

	foreach my $transcript (@transcripts) {
		$self->{_trans_to_multi_map_counts}->{$transcript}->{$combo}++;
	}

	$self->{_total_reads}++;

	return;
}

####
sub count_reads_mapped_to_transcript {
	my $self = shift;
	my ($transcript) = @_;
	
	my $unique_read_count = 0;
	my $multi_map_count = 0;


	my @combos = $self->_get_multi_map_list_including_transcript($transcript);                                                       
	
	foreach my $combo (@combos) {                                                                                                    
                                                                                                                                         
		my $combo_count = $self->_get_multi_map_read_count_for_combo($transcript, $combo);                                           
		
		
		my @other_transcripts = $self->_get_transcripts_in_combo($combo);
		
		if (scalar @other_transcripts == 1) {                                                                                        
			# only current transcript:                                                                                               
			$unique_read_count += $combo_count;
		}                                                                                                                            
		else {                                   
			$multi_map_count += $combo_count;
		}
	}

	return($unique_read_count, $multi_map_count);
}

1; #EOM