#! /usr/bin/perl -w

# lambda.pl $Rev$ - An estimator for Dawg's lambda
# Copyright (2004-2005) Reed A. Cartwright.  All rights reserved.
#
# Usage: perl lambda.pl [treefile] [fastafile]
#
# Distributed under the same license as DAWG
#

use strict;

# options
my $user_dist = 0; # change to 1 to enable the display of the US gap distribution
my $chisq_bins = 0; # change to 1 to see the chisq bins

# Table for Zipf Estimation [1.01..3.0 by 0.01]
my $ZetaTableS = 1.01;
my $ZetaTableI = 0.01;
my @ZetaTable = (100.57794333849678, 50.57867004101556, 
	33.91272910377197, 25.580120524770102, 20.58084430203699, 
	17.248233766955963, 14.868003203314164, 13.083009752064346, 
	11.694840712912123, 10.584448464950801, 9.676075430586256, 
	8.91921655749934, 8.2789068089333, 7.730171158143642, 7.254694585068123, 
	6.83874082424536, 6.4718056722916035, 6.14571919288521, 
	5.854031435966521, 5.591582441177752, 5.354195097462891, 
	5.138451768231236, 4.941529188217385, 4.761074636126193, 
	4.5951118258429435, 4.441968515056512, 4.300220200855159, 
	4.168645880551769, 4.0461929654417865, 3.931949211809544, 
	3.8251200846430273, 3.725010365652108, 3.6310091052834443, 
	3.5425772302621485, 3.45923727555487, 3.380564827672348, 
	3.306181355544513, 3.2357481733626594, 3.168961332215122, 
	3.105547277977581, 3.045259144634053, 2.98787357712343, 
	2.9331879975097537, 2.8810182439474383, 2.8311965244511157, 
	2.783569637565062, 2.737997420181188, 2.69435138937978, 
	2.6525135505754367, 2.612375348685488, 2.5738367426903555, 
	2.5368053869753955, 2.501195905351019, 2.4669292457372296, 
	2.433932105246206, 2.402136416863166, 2.371478890160818, 
	2.3419005995261015, 2.3133466142621635, 2.2857656656801293, 
	2.2591098469359414, 2.2333343419152283, 2.2083971799386637, 
	2.184259013463686, 2.1608829163060492, 2.1382342002048644, 
	2.116280247814644, 2.0949903604333366, 2.074335618971391, 
	2.0542887568377517, 2.0348240435678777, 2.015917178149418, 
	1.9975451911156323, 1.9796863545771772, 1.962320099451342, 
	1.9454269392258112, 1.9289883996629098, 1.9129869539112034, 
	1.897405962545327, 1.882229618102822, 1.867442893729346, 
	1.853031495581547, 1.8389818186706854, 1.8252809058602786, 
	1.8119164097580231, 1.7988765572664147, 1.7861501165781457, 
	1.7737263664218121, 1.7615950673809382, 1.749746435125061, 
	1.73817111540579, 1.7268601606835563, 1.7158050082623097, 
	1.7049974598198623, 1.694429662231051, 1.6840940895894492, 
	1.6739835263411282, 1.6640910514510443, 1.6544100235290304, 
	1.6449340668482264, 1.6356570581940908, 1.626573114486998, 
	1.617676581125839, 1.608962021004109, 1.600424204153639, 
	1.5920580979745251, 1.583858858012912, 1.575821819251114, 
	1.5679424878771757, 1.560216533503362, 1.5526397818052655, 
	1.5452082075552418, 1.5379179280257507, 1.5307651967398803, 
	1.5237463975479324, 1.5168580390103819, 1.5100967490688908, 
	1.5034592699882794, 1.4969424535535234, 1.4905432565068935, 
	1.484258736211354, 1.4780860465272285, 1.4720224338900036, 
	1.4660652335779072, 1.4602118661586483, 1.4544598341053465, 
	1.4488067185723421, 1.4432501763221333, 1.437787936795243, 
	1.432417799315324, 1.4271376304222598, 1.4219453613265007, 
	1.4168389854782233, 1.411816556245341, 1.4068761846947044, 
	1.4020160374711905, 1.3972343347696727, 1.392529348395166, 
	1.3878993999066986, 1.3833428588407357, 1.3788581410101926, 
	1.3744437068753212, 1.370098059982948, 1.3658197454707395, 
	1.3616073486333566, 1.3574594935475333, 1.3533748417532712, 
	1.3493520909884928, 1.3453899739746535, 1.341487257250917, 
	1.3376427400546582, 1.3338552532461516, 1.330123658275428, 
	1.326446846189378, 1.3228237366772906, 1.319253277153097, 
	1.3157344418726857, 1.3122662310847368, 1.3088476702135943, 
	1.3054778090727805, 1.302155721107818, 1.2988805026670913, 
	1.2956512722995475, 1.2924671700780854, 1.2893273569475512, 
	1.2862310140962918, 1.283177342350291, 1.2801655615889367, 
	1.2771949101815299, 1.2742646444436798, 1.2713740381127698, 
	1.2685223818417177, 1.2657089827102934, 1.262933163753284, 
	1.2601942635048347, 1.2574916355583214, 1.2548246481411414, 
	1.2521926837038304, 1.2495951385229584, 1.2470314223172532, 
	1.2445009578764485, 1.2420031807023697, 1.2395375386617788, 
	1.2371034916505452, 1.2347005112686997, 1.2323280805059718, 
	1.2299856934374145, 1.2276728549287421, 1.2253890803510186, 
	1.2231338953043551, 1.2209068353502854, 1.218707445752503, 
	1.2165352812256536, 1.2143899056919019, 1.2122708920449774, 
	1.210177821921451, 1.2081102854789683, 1.2060678811812071, 
	1.2040502155893134, 1.202056903159594);

my @ZetaRatioTable = ( -99.42465416119387, -49.42651278295983, -32.7616936349154, 
	-24.430196816907994, -19.432022427470404, -16.100503897177585, 
	-13.721355607717234, -11.937434796674157, -10.550328859139627, 
	-9.44099026836784, -8.53366153972204, -7.777837712657673, 
	-7.1385538410301885, -6.590834987564802, -6.116366220546042, 
	-5.701411361755471, -5.335466293967182, -5.010361165950622, 
	-4.719646111688432, -4.458161253833933, -4.221729563328257, 
	-4.00693348458076, -3.8109498323455355, -3.631425964382572, 
	-3.4663856726837654, -3.3141567921116177, -3.1733148960102397, 
	-3.0426390570454855, -2.921076760977479, -2.8077158376812346, 
	-2.7017618248754207, -2.6025195761543083, -2.5093782130161015, 
	-2.4217987324177765, -2.3393037387510347, -2.2614688871575184, 
	-2.187915714416261, -2.118305601798002, -2.052334666712738, 
	-1.989729420611674, -1.9302430623196227, -1.8736523008927266, 
	-1.819754621799693, -1.7683659258977484, -1.7193184832130266, 
	-1.6724591536202988, -1.6276478346710939, -1.58475610344434, 
	-1.543666024702038, -1.5042691020671974, -1.466465352593545, 
	-1.4301624881165254, -1.3952751892823985, -1.3617244602415755, 
	-1.3294370537398257, -1.2983449578075912, -1.268384936482713, 
	-1.2394981180452773, -1.2116296251275196, -1.1847282418133533, 
	-1.158746113482776, -1.1336384757041358, -1.1093634089466802, 
	-1.0858816162892664, -1.06315622164869, -1.0411525863512565, 
	-1.0198381421311062, -0.9991822388642257, -0.9791560055431779, 
	-0.9597322231683802, -0.940885208380976, -0.9225907067928715, 
	-0.9048257950839338, -0.8875687910368727, -0.8707991707687682, 
	-0.8544974924961801, -0.838645326239606, -0.8232251889339728, 
	-0.8082204844657707, -0.79361544820532, -0.7793950956451932, 
	-0.7655451747936654, -0.7520521220058026, -0.7389030209648924, 
	-0.726085564553807, -0.7135880193799546, -0.7013991927389883, 
	-0.689508401821763, -0.6779054449863241, -0.6665805749322685, 
	-0.6555244736287632, -0.6447282288600555, -0.6341833122635604, 
	-0.6238815587457621, -0.613815147170221, -0.6039765822201625, 
	-0.5943586773454108, -0.5849545387099745, -0.5757575500624081, 
	-0.5667613584562456, -0.5579598607523722, -0.5493471908392097, 
	-0.5409177075100653, -0.5326659829399969, -0.5245867917070516, 
	-0.5166751003048036, -0.5089260570947702, -0.5013349826484628, 
	-0.49389736042966514, -0.4866088277678791, -0.47946516707388387, 
	-0.4724622972478885, -0.46559626522991804, -0.45886323764076514, 
	-0.45225949246012226, -0.4457814106862988, -0.43942546791927434, 
	-0.4331882258056473, -0.4270663232803669, -0.4210564675358515, 
	-0.4151554246442902, -0.4093600097534363, -0.4036670767700985, 
	-0.39807350743870257, -0.39257619971471763, -0.3871720553243887, 
	-0.38185796639297176, -0.3766308010135526, -0.3714873876174255, 
	-0.36642449799486676, -0.3614388288019101, -0.3565269813743003, 
	-0.35168543965413074, -0.3469105460176611, -0.34219847477435444, 
	-0.3375452030872263, -0.33294647904299474, -0.3283977865772394, 
	-0.32389430693462035, -0.3194308763171518, -0.31500193934436266, 
	-0.3106014979178754, -0.30622305504926456, -0.3018595531739829, 
	-0.2975033064354322, -0.2931459263818405, -0.2887782404742653, 
	-0.284390202756679, -0.279970795988474, -0.275507924485767, 
	-0.2709882968603094, -0.266397297783542, -0.26171884783810045, 
	-0.25693525044975124, -0.25202702481907663, -0.24697272369407414, 
	-0.24174873474194597, -0.23632906419054672, -0.23068510131703507, 
	-0.2247853622629635, -0.21859521155122816, -0.21207655957064997, 
	-0.20518753417837918, -0.1978821244484786, -0.19010979446683013, 
	-0.18181506493763538, -0.17293706022511063, -0.1634090183052305, 
	-0.15315776094644973, -0.1421031212749423, -0.13015732570897476, 
	-0.11722432706829741, -0.10319908547787622, -0.08796679349060989, 
	-0.07140204165094988, -0.053367920510283995, -0.03371505488566567, 
	-0.012280565925787676, 0.011113043687890184, 0.03665908632130593, 
	0.06456797291826083, 0.09506858891097204, 0.1284097682092414, 
	0.16486187124327872, 0.20471847331934726, 0.24829816983861908, 
	0.29594650522777877, 0.3480380327342986, 0.40497851254960615, 
	0.46720725603869884, 0.5351996241746824, 0.6094696886000213, 
	0.6905730640625012, 0.7791099213014138, 0.8757281897876625, 
	0.9811269600483958, 1.0960600956316162, 1.2213400650864328, 
	1.35784200464962, 1.5065080226352798);

my $gettree = 1;
my $tree = '';
my %seqs = ();
my $seqname = '';

# process STDIN line by line
while(<>)
{
	# remove newline and flanking whitespace
	chomp;   
	s/^\s+//;
	s/\s+$//;
	#check to see if the programs should shift to reading sequences		
	$gettree = 0 if($gettree && m/^>/);
	if($gettree)
	{
		$tree .= $_;
	}
	#check to see if the line is a fasta sequence name
	elsif(/^>\s*(.+)\s*/)
	{
		$seqname = $1;
		$seqs{$seqname} ||= '';
	}
	#append the line to the sequence
	else
	{
		$seqs{$seqname} .= $_;
	}
}

#calculate the total length of the tree by extracting all the branch lengths
my $totlen = 0;
$totlen += $1 while($tree =~ m/:([\d\.]+)/g);

#convert sequences to a table
my @table = values(%seqs);

my %gaps = ();

my $avgL = 0;
#find unique gaps and count the ungapped length of the sequences
foreach(@table)
{
	while(/([-=]+)/g)
	{
		my $m = $1;
		my $e = pos;
		my $b = $e - length($m);
		$gaps{$b, $e} = 1;
	}
	while(/([+=]+)/g)
	{
		my $m = $1;
		my $e = pos;
		my $b = $e - length($m);
		$gaps{$b, $e} = 1;
	}
	$avgL += tr/ACGTacgt//;
}
#find average length
$avgL /= @table;

#count gaps
my $numgaps = keys(%gaps);

#calculate lambda estimate
my $lambda = $numgaps/$avgL/$totlen;

#calculate distribution of gaps sizes
my %gapsizes = ();
my $maxgap = 0;
my $suml = 0;
foreach(keys(%gaps))
{
	my ($f, $l) = split(/$;/);
	$l -= $f;
	$gapsizes{$l} ||=0;
	$gapsizes{$l}++;
	$maxgap = $l if($l > $maxgap);
	$suml += $l;
}

my @gapnum = map {exists($gapsizes{$_}) ? $gapsizes{$_} : 0 } (1..$maxgap);
my @xsqbins = ();
for(my $s = 1; $s <= $maxgap; $s *= 2)
{
	push(@xsqbins, $s*2);
}
my @xsq_ob = ();
my $x = 1;
foreach my $xm (@xsqbins)
{
	my $bc = 0;
	for(;$x<$xm && $x<=$maxgap;++$x) {$bc += $gapnum[$x-1];}
	push(@xsq_ob, $bc);
}

my $avgG = $suml/$numgaps-1.0;

my $gapclasses = keys(%gapsizes);

my %model = ();

# Geometric Model
#MLE of q
my $q = $avgG/($avgG+1.0);
#calculate LL(q)
my $LL = $numgaps*(log(1.0-$q)-log($q))+$suml * log($q);
#calculate BIC 
my $bic = -2*$LL+log($numgaps)*1.0;
my $aic = -2*$LL+2*1.0;
#calculate xsq
my $xsq = 0.0;
my $df = $gapclasses-2;
while(my($g,$n) = each(%gapsizes))
{
	my $e = $numgaps*((1.0-$q) * $q**($g-1.0));
	$xsq += ($n-$e)**2/$e;
}
$xsq = 0.0;
$df = 0;
$x = 1;
my @xsq_ex = ();
foreach my $xm (@xsqbins)
{
	my $bc = 0;
	for(;$x<$xm && $x<=$maxgap;++$x) {$bc += ((1.0-$q) * $q**($x-1.0));}
	push(@xsq_ex, $numgaps*$bc);
}

for(0..$#xsq_ex)
{
	next unless($xsq_ob[$_]);
	$df += 1;
	$xsq += ($xsq_ob[$_]-$xsq_ex[$_])**2/$xsq_ex[$_];
}
$df -= 2;
#calculate rsq
my @L1 = ();
my @L2 = ();
while(my($g,$n) = each(%gapsizes))
{
	push(@L1, log($n) - log($numgaps));
	push(@L2, log(1-$q)+log($q)*($g-1));
}
my $r = rsq([@L1],[@L2]);

$model{Geometric} = {AIC => $aic, BIC => $bic, LL => $LL, DF => $df, XSQ => $xsq, Params => {'q' => $q}, text => "GapModel = \"NB\"\nGapParams = {1, $q}", f => 'f(x) = (1-q)*q^(x-1)', RSQ => $r};

# Zipf's Power-Law Model
my $plmle = 0;
for(0..$#gapnum)
{
	$plmle += $gapnum[$_]*log($_+1);
	
}
$plmle /= -$numgaps;
my $i = 0;
while($i < @ZetaRatioTable && $ZetaRatioTable[$i] < $plmle)
{
	$i++;
}

#if $i == 0, undefined
if($i == 0)
{
	$model{PowerLaw} = {AIC => 'NA', BIC => 'NA', LL => 'NA', DF => 'NA',
			XSQ => 'NA', text => '',
		Params => {a => '<'.$ZetaTableS, z => $plmle},
		f => 'f(x) = x^(-a)/Zeta(a)', RSQ => 'NA'};
}
elsif($i == @ZetaRatioTable)
{
	$model{PowerLaw} = {AIC => 'NA', BIC => 'NA', LL => 'NA', DF => 'NA', XSQ => 'NA', test => '',
		Params => {a => '<'.$ZetaTableS+$ZetaTableI*@ZetaRatioTable, z => $plmle},
	f => 'f(x) = x^(-a)/Zeta(a)', RSQ => 'NA'};
	
}
else
{
	my $x1 = $ZetaRatioTable[$i-1];
	my $x2 = $ZetaRatioTable[$i];
	my $y1 = ($i-1)*$ZetaTableI+$ZetaTableS;
	my $y2 = $y1+$ZetaTableI;
	my $a = $y1+($plmle-$x1)*($y2-$y1)/($x2-$x1);
	$x1 = $ZetaTable[$i-1];
	$x2 = $ZetaTable[$i];
	my $b = $y1+($plmle-$x1)*($y2-$y1)/($x2-$x1);
	$LL = 0;
	$LL += log($_) * $gapsizes{$_} foreach(keys(%gapsizes));
	$LL = -$numgaps * log($b) - $a * $LL;
	$bic = -2*$LL+log($numgaps)*2.0;
	$df = $gapclasses-2;
	$aic = -2*$LL+2*1.0;
	$xsq = 0.0;
	my $S = 0;
	my $F = 0;
	for(0..$#gapnum)
	{
		my $n = $gapnum[$_];
		next unless($n);
		my $e = $numgaps*($_+1)**-$a/$b;
		$xsq += ($n-$e)**2/$e;
	}
	$xsq = 0.0;
	$df = 0;
	$x = 1;
	@xsq_ex = ();
	foreach my $xm (@xsqbins)
	{
		my $bc = 0;
		for(;$x<$xm && $x<=$maxgap;++$x) {$bc += $x**-$a/$b;}
		push(@xsq_ex, $numgaps*$bc);
	}
	
	for(0..$#xsq_ex)
	{
		next unless($xsq_ob[$_]);
		$df += 1;
		$xsq += ($xsq_ob[$_]-$xsq_ex[$_])**2/$xsq_ex[$_];
	}
	$df -= 2;
	@L1 = ();
	@L2 = ();
	while(my($g,$n) = each(%gapsizes))
	{
		push(@L1, log($n) - log($numgaps));
		push(@L2, -$a * log($g) - log($b));
	}
	$r = rsq([@L1],[@L2]);
	my $avgLi = int($avgL);
	$model{PowerLaw} = {AIC => $aic, BIC => $bic, LL => $LL, DF => $df,
			XSQ => $xsq, Params => {a => $a, z => $plmle},
			text => "GapModel = \"PL\"\nGapParams = {$a, $avgLi}",
			f => 'f(x) = x^(-a)/Zeta(a)', RSQ => $r};
}

#output
if($user_dist)
{
	print "\nGap Size Distribution:\n";
	print join("\t", $_, $gapsizes{$_} || 0, ($gapsizes{$_} || 0)/$numgaps), "\n" foreach(1..$maxgap);
}
if($chisq_bins)
{
	print "\nChi-Squared Bins:\n";
	my $xi = 1;
	foreach(@xsq_ob)
	{
		print "\t$xi-";
		$xi*=2;
		print $xi-1, "\t$_\n";
	}
}
print "# Total Tree Length is $totlen.\n";
print "Tree = $tree\n";
print "\n# Average Sequence Length is $avgL.\n";
print "Length = $avgL.\n";
print "\n# Lambda Estimate is $lambda.\n";
print "Lambda = $lambda\n";
print "\n# Number of gaps is $numgaps.\n";
print "# Average Gap Size is ", $avgG + 1.0, ".\n";

print "\n# Estimated Models\n";

foreach my $k (sort(keys(%model)))
{
	print "# $k:\n#    $model{$k}->{f}\n#    ";
	my @par = ();
	foreach my $p (sort(keys(%{$model{$k}->{Params}})))
	{
		push(@par, "$p = $model{$k}->{Params}->{$p}");
	}
	print join(', ', @par), "\n";
	print "#    LogLik     = $model{$k}->{LL}\n";
	print "#    AIC        = $model{$k}->{AIC}\n";
	print "#    BIC        = $model{$k}->{BIC}\n";
	print "#    Xsq        = $model{$k}->{XSQ} ($model{$k}->{DF} df)\n";
	#print "#    Rsq of Log = $model{$k}->{RSQ}\n";
	my $text = $model{$k}->{text};
	$text =~ s/^/# /mg;
	print $text, "\n";
}

my $k_ll;

foreach my $k(keys %model)
{
	$k_ll = $k if( !(defined $k_ll) || $model{$k_ll}->{LL} < $model{$k}->{LL});
}

print "\n# Most Likely Model\n";
print $model{$k_ll}->{text}, "\n";

sub rsq
{
	my ($rO, $rE) = @_;
	my @obs = @$rO;
	my @est = @$rE;
	my $m = 0.0;
	my $m2 = 0.0;
	my $SSe = 0.0;
	for my $i (0..$#obs)
	{
		$SSe += ($obs[$i] - $est[$i])*($obs[$i] - $est[$i]);
		$m += $obs[$i];
		$m2 += $obs[$i]*$obs[$i];
	}
	return 1.0 - $SSe/($m2-$m*$m/@obs);
	
}




