1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
|
#!/usr/bin/perl
#
# Author: Petter Reinholdtsen
# Date: 2006-02-01
# License: GNU General Public License
use warnings;
use strict;
my $missinglimit = 20;
my %wordfreq;
my %seen;
load_wordfreq($ARGV[0]);
update_freq($ARGV[1]);
report_missing();
exit 0;
sub load_wordfreq {
my $filename = shift;
open(F, "<$filename") or die "Unable to read '$filename'";
while (<F>) {
chomp;
my ($count, $word) = split;
next if ($word =~ m/^\d+$/); # Ignore numbers
my $f;
if ($count<=5) {
$f=$count;
} else {
$f = -9 + 15 * log(1+log($count));
}
$wordfreq{$word} = sprintf("%.0f", $f);
}
close F;
}
sub update_freq {
my $filename = shift;
open (F, "<$filename") or die "Unable to read '$filename'";
while (<F>) {
chomp;
if (/^\#/) {
print "$_\n";
next;
}
my ($word, $rest) = ("", "0");
if (m/^(\S+) (.*)$/) {
($word, $rest) = ($1,$2);
my $f = "0";
my $c;
$c = $f = $1 if ($rest =~ m/(\d+) ?/);
my $shortword = lc($word);
$shortword =~ s/-//g;
$seen{$shortword} = 1;
if (exists $wordfreq{lc($word)}) {
$c = $wordfreq{lc($word)};
} elsif (exists $wordfreq{$shortword}) {
$c = $wordfreq{$shortword};
}
$rest =~ s/$f/$c/ if (0 == $f);
}
print "$word $rest\n";
$seen{lc($word)} = 1;
}
close F;
}
sub report_missing {
for my $word (keys %wordfreq) {
if ($wordfreq{$word} > $missinglimit && ! exists $seen{lc($word)}) {
print STDERR "Missing word '$word' have high frequency.\n";
}
}
}
|