File: eval_dualdecomp.pl

Package: augustus 3.4.0+dfsg2-2 (Debian bullseye, main)
#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long; # for parameter specification on the command line

if (!eval { require Statistics::R; 1 }) { # Statistics::R is used for making R plots
    # module could not be loaded
    print "perl module Statistics::R is not installed.\n";
    print "Please install it, e.g. via CPAN. On the command line, type:\n\n";
    print "perl -MCPAN -e 'install Statistics::R'\n";
} else {
    Statistics::R->import();
}
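# Note: the script continues without Statistics::R. The summary table is still
# printed; only the plotting code fails later if a plot is actually requested
# (one of the --hist_*/--err_per_iter options, or a gene range whose error
# estimate exceeds the --t threshold).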

my $usage = <<'ENDUSAGE';
eval_dualdecomp.pl    evaluate the effectiveness of dual decomposition either on a single
                      AUGUSTUS output file or a directory of AUGUSTUS output files.
SYNOPSIS

eval_dualdecomp.pl [ --f=input_file | --d=input_directory ]

    --f=<file>                          input AUGUSTUS file
    --d=<dir>                           directory of input AUGUSTUS files (recognized by .out file extension)

OPTIONS

    --help                              output this help message
    --hist_iterations=<out.pdf>         output histogram of iterations to out.pdf
    --hist_errors=<out.pdf>             output histogram of error estimates to out.pdf for all cases where
                                        no convergence is achieved
    --err_per_iter=<out.pdf>            plot the average percentage of the initial error against the iterations to out.pdf.
                                        If the error no longer drops after a certain number of iterations,
                                        early stopping is recommended, i.e. in the next run the number of
                                        DD iterations can be restricted to this number of iterations.
    --t=<float>                         threshold for the percentage of the initial error. For all cases with an
                                        estimated error higher than this threshold, the evolution of the primal
                                        and dual values is plotted against the iterations. This helps to debug
                                        cases with a high error estimate. The threshold must be in [0,100] (default: 5)
    --outdir=<dir>                      put all plots in this output directory


DESCRIPTION
      
  Examples:

    eval_dualdecomp.pl --d=augouts --hist_iterations=iterations.pdf --hist_errors=errors.pdf
    eval_dualdecomp.pl --d=augouts --hist_iterations=iterations.pdf --hist_errors=errors.pdf --err_per_iter=error_per_iter.pdf --outdir=out
    eval_dualdecomp.pl --f=aug.out --t=0

ENDUSAGE


my ($hist_iter, $hist_err, $err_per_iter, $DIR, $FILE, $help); # options
my $t = 5;  # threshold for percentage of initial error
my @docs = ();
my $outdir = "";

GetOptions('d=s'=>\$DIR,
	   'f=s'=>\$FILE,
	   'hist_iterations=s' =>\$hist_iter,
	   'hist_errors=s' =>\$hist_err,
	   'err_per_iter=s' =>\$err_per_iter,
	   't=f' =>\$t,
	   'outdir=s' =>\$outdir,
	   'help!'=>\$help);

if (defined($DIR)){
    $DIR =~ s/\/$//g;
    opendir(my $DH, $DIR) or die "cannot open directory $DIR: $!";
    @docs = grep(/\.out$/,readdir($DH));
    @docs = map "$DIR/$_", @docs;
}
if (defined($FILE)){
    push @docs, $FILE;
}

if(!defined($DIR) && !defined($FILE)){
    print "Either an input directory (option --d) or an input file (option --f) is required.\n$usage";
    exit(1);
}
if ($outdir ne ""){
    system ("rm -rf $outdir; mkdir $outdir"); # recreate the output directory from scratch
    $outdir =~ s/([^\/])$/$1\//; # ensure a trailing slash so that $outdir can be used as a path prefix
}


my $id = 0;                 # gene range ID
my @avgErr = ();            # average error per iteration

my @conv_iter = ();         # stores number of iterations of all cases that did converge
my @conv_best_p = ();       # stores the iterations of the best primal values of all cases that did converge

my @not_conv_iter = ();     # stores number of iterations of all cases that did not converge
my @not_conv_best_p = ();   # stores the iterations of the best primal values of all cases that did not converge
my @not_conv_error = ();    # stores the estimated errors of all cases that did not converge

my $alreadyOpt = 0; # counts number of cases that are already optimal without DD

my @rounds = (); # number of cases that converged per round

# hash of gene ranges
#    keys: gene range nr
#    values: hash reference
#            keys: iter               array of iterations
#                  primal             array of primal values
#                  dual               array of dual values
#                  stepsize           array of step sizes
#                  inconsistencies    number of inconsistencies in the latest iteration
#                  round              current round
#                  total_iter         index of the latest iteration (0-based)
#                  best_primal        largest primal value seen so far
#                  best_dual          smallest dual value seen so far

my %geneRanges = ();
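# The parsing loop below expects each gene range to appear in the AUGUSTUS
# output as a block of the following shape (inferred from the patterns matched
# below): a line "dual decomposition on gene Range <nr>", a header line
# starting with "round<TAB>iter", then whitespace-separated data lines whose
# first six columns are read as round, iteration, step size, primal value,
# dual value and number of inconsistencies, terminated by a line starting
# with "dual decomposition reduced".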


foreach my $file (@docs) {
    open (my $RES, '<', $file) or die "could not open file $file\n";
    my $contents = 0;
    while(<$RES>){
	if(/dual decomposition on gene Range (\d+)/){
	    $id++;
	    $geneRanges{$id} = {"iter"=>[], "primal"=>[], "dual"=>[], "stepsize"=>[]};
	}
	next if(!/^round\titer/ && !$contents);
	if(/^dual decomposition reduced/){
	    my $numIter = scalar @{$geneRanges{$id}{"primal"}};
	    if($numIter <= 1){ # already optimal cases, no DD required
		$alreadyOpt++;
		$contents=0;
		delete $geneRanges{$id};
		next;
	    }	    
	    my ($idx_max, $max) = findMaxValueIndex(\@{$geneRanges{$id}{"primal"}});
	    my ($idx_min, $min) = findMinValueIndex(\@{$geneRanges{$id}{"dual"}});
		
	    # calculate the remaining error as a percentage of the initial error:
	    # (best dual - best primal) * 100 / (best dual - first primal)
	    my $initial_primal = $geneRanges{$id}{"primal"}->[0];
	    my $perc_initial_err = ($min - $initial_primal) > 0 ? ($min - $max)*100 / ($min - $initial_primal) : 0;
	    if($perc_initial_err < 0){ # can become slightly negative due to rounding
		$perc_initial_err = 0;
	    }
	    # plot dual and primal values against iterations for all gene ranges with an
	    # error greater than threshold t
	    if($perc_initial_err > $t){
		plot_dual_vs_primal($perc_initial_err);
	    }
	    if($geneRanges{$id}{"inconsistencies"} == 0 || $min-$max < 1e-8){ # convergence achieved
		push @conv_iter, $numIter;
		push @conv_best_p, $idx_max;
		$rounds[$geneRanges{$id}{"round"}]++;
	    }
	    else{ # not converged, store the error estimate
		push @not_conv_error, $perc_initial_err;
		push @not_conv_iter, $numIter;
		push @not_conv_best_p, $idx_max; 
	    }
	    delete $geneRanges{$id};
	    $contents=0;
	    next;
	}
	if($contents){
	    my @l = split(/\s+/,$_);
	    my ($round, $iteration, $stepsize, $primal, $dual, $inconsist) = ($l[0], $l[1], $l[2], $l[3], $l[4], $l[5]);
	    if(!defined($geneRanges{$id}{"total_iter"})){
		$geneRanges{$id}{"total_iter"}=0;
	    }
	    else{
		$geneRanges{$id}{"total_iter"}++;
	    }
	    $geneRanges{$id}{"round"} = $round;
	    push @{$geneRanges{$id}{"primal"}}, $primal;
	    push @{$geneRanges{$id}{"dual"}}, $dual;
	    push @{$geneRanges{$id}{"stepsize"}}, $stepsize;
	    $geneRanges{$id}{"inconsistencies"}=$inconsist;
	    if(!defined($geneRanges{$id}{"best_primal"}) || $geneRanges{$id}{"best_primal"} < $primal){
		$geneRanges{$id}{"best_primal"} = $primal;
	    }
	    if(!defined($geneRanges{$id}{"best_dual"}) || $geneRanges{$id}{"best_dual"} > $dual){
		$geneRanges{$id}{"best_dual"} = $dual;
	    }
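	    # running error estimate for this iteration: the remaining duality gap
	    # (best dual minus best primal so far) as a percentage of the initial
	    # gap (best dual minus the first primal value); the dual bounds the
	    # primal from above, so both differences are non-negative up to rounding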
	    my $best_d = $geneRanges{$id}{"best_dual"};
	    my $best_p = $geneRanges{$id}{"best_primal"};
	    my $initial_p = $geneRanges{$id}{"primal"}->[0];
	    my $e = 0;
	    if(($best_d - $initial_p) > 0){
		$e = ($best_d - $best_p)*100 / ($best_d - $initial_p);
	    }
	    if($e < 0){
		$e = 0;  # rounding error
	    }
	    push @{$avgErr[$geneRanges{$id}{"total_iter"}]} , $e;
	}
	$contents=1;
    }
    close($RES);
}
my @iter = (@conv_iter, @not_conv_iter);
my @best_p = (@conv_best_p, @not_conv_best_p);

my ($idx_error, $max_error) = (0,0);
if(@not_conv_error){
 ($idx_error, $max_error) = findMaxValueIndex(\@not_conv_error);
}
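# converged cases contribute an error of 0, so the 'total' row of the table
# below averages the error estimate over all gene ranges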
my @total_error = (@not_conv_error, ((0) x (scalar @conv_iter)));

print "\n$alreadyOpt gene Ranges were discarded, because they were already optimal prior to Dual Decomposition\n\n";
printf("+-------------------------------------+----------+-----------+-----------+----------------------+ \n");
printf("| gene Ranges      |        No.       | avg iter | avg error | max error | avg iter best primal |\n");
printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n");
printf("| e-convergence    | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n", scalar @conv_iter, (scalar @conv_iter *100 / scalar @iter),avg(\@conv_iter),0,0,avg(\@conv_best_p));
printf("| no e-convergence | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n", scalar @not_conv_iter, (scalar @not_conv_iter *100 / scalar @iter), avg(\@not_conv_iter), avg(\@not_conv_error), $max_error, avg(\@not_conv_best_p));
printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n");
printf("| total            | %6u (%6.2f%%) |   %4u   |  %5.2f%%   |  %5.2f%%   |        %4u          |\n",  scalar @iter, 100, avg(\@iter), avg(\@total_error), $max_error, avg(\@best_p));
printf("+-------------------------------------+----------+-----------+-----------+----------------------+\n\n");
print "No. of e-convergences per round of Dual Decomposition\n\n";
foreach my $i (0 .. $#rounds) {
    print "round $i - " . ($rounds[$i] // 0) . "\n";
}
print "\nIf the number of e-convergences after n rounds is only very small, the number of rounds\n";
print "can be restricted to n in the next run.\n\n";
    
# plot histogram of iterations
if(defined($hist_iter)){
    my $R = Statistics::R->new();
    $R->set('font',"Helvetica");
    $R->set('filename', $outdir . $hist_iter);
    $R->set('data',\@iter);
    $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
    $R->run(q`pdf(filename, family=font)`);
    $R->run(q`par(cex.main=1.5, cex.lab=1.5, cex.axis=1.5, cex=1)`);
    $R->run(q`hist(data,breaks=50,col="red",xlab="Iterations", main="")`);
    $R->run(q`dev.off()`);
}

# plot histogram of errors
if(defined($hist_err) && @not_conv_error){
    my $R = Statistics::R->new();
    $R->set('filename', $outdir . $hist_err);
    $R->set('font',"Helvetica");
    $R->set('data',\@not_conv_error);
    $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
    $R->run(q`pdf(filename, family=font)`);
    $R->run(q`par(cex.main=1.5, cex.lab=1.5, cex.axis=1.5, cex=1)`);
    $R->run(q`hist(data,breaks=50,col="red",xlab="% of initial error", main="")`);
    $R->run(q`dev.off()`);
}

# plot average percentage of initial error against iterations
if(defined($err_per_iter)){
    my $n = scalar @{$avgErr[0]};
    my @avgs = ();
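    # the curve is truncated once the average error drops by less than 1e-5
    # between consecutive iterations, see the early exit in the loop below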
    for my $i (@avgErr){ # compute average error per iteration
	my $avg = sum($i) / $n;
	push @avgs, $avg;
	if( scalar @avgs > 1  && ($avgs[$#avgs-1] - $avgs[$#avgs]  < 0.00001)){
	    last;
	}
    }
    my @x = (0..$#avgs);

    my $R = Statistics::R->new();
    $R->set('font',"Helvetica");
    $R->set('avgs',\@avgs);
    $R->set('iter',\@x);
    $R->set('filename', $outdir . $err_per_iter);
    $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
    $R->run(q`pdf(filename, family=font)`);
    $R->run(q`plot(iter,avgs, type="l", lwd=2, col="blue", ylim=c(min(avgs), max(avgs)), xlim=c(0,max(iter)), xlab=expression(paste("iteration ",italic(t))), ylab="average % of initial error", cex.axis=2, cex.lab=2, cex.main=2)`);
    $R->run(q`dev.off()`);	    
}

# plot dual and primal values against iterations
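# Uses the globals $id, %geneRanges and $outdir; takes the estimated percentage
# of the initial error as its only argument and writes the plot to gr_<id>.pdf
# in the output directory.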
sub plot_dual_vs_primal{
    my $err = shift;
    my $filename = $outdir . "gr_" . $id . ".pdf";
    my @x = (0..$geneRanges{$id}{"total_iter"});
    my $R = Statistics::R->new();
    $R->set('font',"Helvetica");
    $R->set('iter',\@x);
    $R->set('primal',\@{$geneRanges{$id}{"primal"}});
    $R->set('dual',\@{$geneRanges{$id}{"dual"}});
    
    $R->set('filename', $filename);
    $R->set('error', $err);
    $R->run(q`(if(require(extrafont)){library(extrafont); font <- "LM Roman 10"})`);
    $R->run(q`pdf(filename, family=font)`);
    $R->run(q`plot(iter,primal, type="l", lwd=2, col="blue", ylim=c(min(primal), max(dual)), xlab=expression(paste("iteration ",italic(t))), ylab="value", cex.axis=2, cex.lab=2, main=bquote("Percentage of initial error" ~ italic(epsilon) == .(error) ~ "%"), cex.main=2)`);
    $R->run(q`lines(iter,dual, type="l", lwd=2, col="red")`);
    $R->run(q`legend("bottomright",c(expression(paste("current primal ",italic(p^t))),expression(paste("current dual ",italic(d^t)))),col=c("blue", "red"), lwd=c(2,2), cex=2, bty="n")`);
    $R->run(q`points(iter[which.max(primal)], max(primal), cex = 1.5, pch = 20)`);
    $R->run(q`text(iter[which.max(primal)], max(primal), labels = expression(italic(p)[best]), cex= 1.5, pos=3)`);
    $R->run(q`dev.off()`);	
}

sub findMaxValueIndex{
    my $idx;
    my $max;
    my @array = @{$_[0]};
    for my $i (0 .. $#array){
	if(!defined($max) || $max < $array[$i]){
	    $idx = $i;
	    $max = $array[$i];
	}
    }
    return ($idx, $max);
}

sub findMinValueIndex{
    my $idx;
    my $min;
    my @array = @{$_[0]};
    for my $i (0 .. $#array){
	if(!defined($min) || $min > $array[$i]){
	    $idx = $i;
	    $min = $array[$i];
	}
    }
    return ($idx, $min);
}

sub avg{
    my @array = @{$_[0]};
    if(@array){
	my $sum = sum(\@array);
	$sum /= scalar @array;
	return $sum;
    }
    return 0;
}

sub sum{
    my $sum = 0;
    my @array = @{$_[0]};
    for my $i (@array){
	$sum+=$i;
    }
    return $sum;
}