File: bootstrap_resampling.pl

package info (click to toggle)
apertium-eval-translator 1.2.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 188 kB
  • sloc: perl: 858; sh: 162; makefile: 7
file content (244 lines) | stat: -rwxr-xr-x 5,758 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/perl -w
use utf8;

# (c) 2007 Felipe Sánchez Martínez
# (c) 2007 Universitat d'Alacant
#
# This software is licensed under the GPL license version 3, or at
# your option any later version
#

use strict;
use warnings;

# Getting command line arguments:
use Getopt::Long;
# Documentation:
use Pod::Usage;
# I/O Handler
use IO::Handle;
# Safely created, automatically removed temporary files
use File::Temp;

use Math::Random::OO::Bootstrap;

#use locale;
#use POSIX qw(locale_h);
#setlocale(LC_ALL,"");

my($source, $test, $ref, $help, $times, $evalscript);

# Command line arguments. When GetOptions reports a parsing problem the
# usage message is printed and the script exits.
GetOptions( 'source|s=s'         => \$source,
            'test|t=s'           => \$test,
            'ref|r=s'            => \$ref,
            'times|n=n'          => \$times,
            'eval|e=s'           => \$evalscript,
            'help|h'             => \$help,
          ) || pod2usage(2);


pod2usage(2) if $help;

# The three corpora and the evaluation script are all mandatory.
foreach my $required ($source, $test, $ref, $evalscript) {
  pod2usage(2) unless ($required);
}

# The number of resampling rounds must be strictly positive: with zero or a
# negative value no score is ever produced and the statistics computed at
# the end of the script would divide by zero.
pod2usage(2) unless ($times && $times > 0);

# Open all three corpora before printing anything, so a bad path is reported
# first. The three-argument form of open keeps characters such as a leading
# '>' or a trailing '|' in a file name from being interpreted as a mode.
open(my $src_fh, '<', $source) or die "Error: Cannot open source file \'$source\': $!\n";
open(my $test_fh, '<', $test) or die "Error: Cannot open test file \'$test\': $!\n";
open(my $ref_fh, '<', $ref) or die "Error: Cannot open reference file \'$ref\': $!\n";

print "Source file: '$source'\n";
print "Test file: '$test'\n";
print "Reference file '$ref'\n";
print "Eval script '$evalscript'\n";
print "Number of times '$times'\n\n";

my(@src_corpus, @test_corpus, @ref_corpus);

# Each file is read to its own end, so every array keeps the real number of
# sentences of its file. Reading one line of each file per iteration would
# make the three arrays the same length whatever the files contain (short
# files padded with undef, extra lines ignored), which would defeat the
# sentence-count checks that follow.
while (<$test_fh>) {
  preprocess();
#  s/[*](\w+)/$1/g;
  push @test_corpus, $_;
}

while (<$ref_fh>) {
  preprocess();
#  s/[*](\w+)/$1/g;
  push @ref_corpus, $_;
}

while (<$src_fh>) {
  preprocess();
#  s/[*](\w+)/$1/g;
  push @src_corpus, $_;
}

close($src_fh);
close($test_fh);
close($ref_fh);

# The three corpora must be parallel: same number of sentences in each.
# The test corpus is taken as the point of comparison.
unless (@test_corpus == @ref_corpus) {
  print STDERR "Error: Test file has ", scalar(@test_corpus), " sentences while reference file has ", scalar(@ref_corpus), "\n";
  exit(1);
}

unless (@test_corpus == @src_corpus) {
  print STDERR "Error: Test file has ", scalar(@test_corpus), " sentences while source file has ", scalar(@src_corpus), "\n";
  exit(1);
}

print "Number of samples (sentences): ",  scalar(@test_corpus), "\n";

# Set up the generator used for bootstrap resampling (sampling with
# replacement). What gets resampled are sentence indices, not the sentences
# themselves, so the same index can be applied to all three corpora.
my @sample=(0 .. scalar(@test_corpus)-1);
my $boots = Math::Random::OO::Bootstrap->new(@sample);
# Constant seed -- presumably so that repeated runs draw the same samples.
$boots->seed(0.42);

# One evaluation score per resampling round; a dot is printed per round as a
# progress indicator.
my @scores;
print "Perfoming bootstrap resampling ";
foreach my $iteration (1..$times) {
  print ".";
  push @scores, eval_sample_set(next_sample_set());
}
print " done.\n";

# confidence() takes the interval bounds from the two ends of the list, so
# the scores are put in ascending numeric order first.
my @sorted_scores = sort { $a <=> $b } @scores;

# Report an interval for each confidence level, from the widest interval to
# the narrowest.
foreach my $level (0.95, 0.85, 0.75, 0.65, 0.50) {
  confidence($level, @sorted_scores);
}

##########################################################################

# Print the interval that holds the central $conf fraction of the scores.
#
# Parameters:
#   $conf   - confidence level, e.g. 0.95
#   @scores - scores in ascending numeric order (the interval bounds are read
#             from the first and last elements left after trimming)
#
# The number of scores dropped from each end is derived from the file-level
# $times (number of resampling rounds). Works on a private copy of the list;
# the caller's array is not modified. Everything is printed to STDOUT.
sub confidence {
  my ($conf, @scores)=@_;

  # Scores to discard from each tail, e.g. 2.5% of the rounds per side at 0.95.
  my $drop=round((1.0-$conf)/2.0*$times);

  print "\n--- Confidence: $conf ---\n";
  print "Removing the top $drop and bottom $drop scores ... ";
  shift @scores foreach (1..$drop);
  pop @scores foreach (1..$drop);
  print " done.\n";

  # With very few rounds (e.g. two rounds at confidence 0.50) trimming removes
  # every score; there is then no interval to report, and computing the mean
  # would divide by zero.
  unless (@scores) {
    print "No scores left after trimming; cannot compute an interval\n";
    return;
  }

  my $min=$scores[0];
  my $max=$scores[$#scores];
  my $half_width=($max-$min)/2.0;
  my $mean=mean(@scores);

  print $mean, " in [ ", $min, " , ",  $max, " ]\n";

  # Same interval expressed as midpoint +/- half of its width.
  print "Score: ", ($min+$half_width), " +/- ", $half_width, "\n";
}

# Draw one bootstrap sample set: one value from the file-level generator
# $boots for every element of the file-level @sample, so the returned list
# has as many entries as @sample.
sub next_sample_set {
  return map { $boots->next() } (0..$#sample);
}

# Write the entries of @$corpus selected by @$indices, one per line, to a
# newly created temporary file and return its File::Temp object. The file is
# created under an unpredictable name in the system temporary directory and
# is removed when the returned object is destroyed.
sub _write_sample_file {
  my ($label, $corpus, $indices) = @_;

  my $tmp = File::Temp->new(TEMPLATE => "${label}_file-XXXXXXXX", TMPDIR => 1);
  foreach my $index (@$indices) {
    print {$tmp} $corpus->[$index], "\n";
  }
  # Buffered write errors (e.g. a full disk) only show up on close.
  close($tmp) or die "Error: Cannot write file \'" . $tmp->filename . "\': $!\n";

  return $tmp;
}

# Run the evaluation script on one sample set and return what it printed.
#
# Parameters:
#   @sampleset - indices into the file-level @src_corpus, @test_corpus and
#                @ref_corpus; an index may occur several times
#
# Returns the script's standard output with the trailing newline removed and
# every ',' replaced by '.'. Dies when a temporary file cannot be written or
# the command cannot be started; warns when the command reports a non-zero
# status.
sub eval_sample_set {
  my (@sampleset)=@_;

  # The three objects must stay alive until the script has run: each file is
  # deleted as soon as its object goes out of scope.
  my $src_tmp  = _write_sample_file('source', \@src_corpus, \@sampleset);
  my $test_tmp = _write_sample_file('test', \@test_corpus, \@sampleset);
  my $ref_tmp  = _write_sample_file('reference', \@ref_corpus, \@sampleset);

  # The command goes through the shell, so the paths are single-quoted in
  # case the temporary directory contains blanks or shell metacharacters.
  # Argument order expected by the script: source, reference, test.
  my @quoted;
  foreach my $tmp ($src_tmp, $ref_tmp, $test_tmp) {
    (my $path = $tmp->filename) =~ s/'/'\\''/g;
    push @quoted, "'$path'";
  }

  #Execution of the evaluation script
  my $output=`$evalscript @quoted`;
  defined $output or die "Error: Cannot run evaluation script \'$evalscript\': $!\n";
  warn "Warning: evaluation script \'$evalscript\' finished with status $?\n" if $? != 0;
  chomp $output;

  # Presumably so that a score printed with a decimal comma is still usable
  # as a number.
  $output =~ tr/,/./;
  #print STDERR $output, "\n";

  return $output;
}

# Round to the nearest integer, with halves going away from zero
# (2.5 -> 3, -2.5 -> -3).
sub round {
  my ($value) = @_;

  # -1, 0 or 1 depending on the sign of the value.
  my $sign = ($value <=> 0);
  my $shifted = $value + 0.5 * $sign;

  # int() truncates towards zero, which completes the rounding.
  return int($shifted);
}

# Arithmetic mean of the values passed in. The list must not be empty: the
# sum is divided by the number of values.
sub mean {
  my @values = @_;

  my $total = 0.0;
  $total += $_ foreach @values;

  return $total / scalar(@values);
}


# Clean up the line currently held in $_ (modified in place): the trailing
# input record separator (a newline by default) is removed. Takes no
# arguments.
sub preprocess {
  chomp($_);
  #Insert spaces before and after  punctuation marks
  #s/([.,;:%¿?¡!()\[\]{}<>])/ $1 /g;
}


__END__

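=head1 NAME

mteval_by_bootstrap_resampling.pl - confidence intervals for an MT evaluation score by bootstrap resampling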

=head1 SYNOPSIS

mteval_by_bootstrap_resampling.pl -source srcfile -test testfile -ref
reffile -times <n> -eval /full/path/to/eval/script

Options:

  -source|-s   Specify the file with the source sentences
  -test|-t     Specify the file with the translations to evaluate
  -ref|-r      Specify the file with the reference translations
  -times|-n    Specify how many times the resampling should be done
  -eval|-e     Specify the full path to the MT evaluation script
  -help|-h     Show this help message

Note: Reference translation MUST have no unknown-word marks, even if
      they are free rides.

(c) 2007 Felipe Sánchez Martínez
(c) 2007 Universitat d'Alacant

This software is licensed under the GNU GENERAL PUBLIC LICENSE version
2, or at your option any later version. See
http://www.gnu.org/copyleft/gpl.html for a complete version of the
license.