File: freq-update

package info (click to toggle)
norwegian 2.0.10-2
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 26,296 kB
  • ctags: 178
  • sloc: perl: 2,665; makefile: 1,724; sh: 206
file content (75 lines) | stat: -rwxr-xr-x 1,523 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/perl
#
# Author:  Petter Reinholdtsen
# Date:    2006-02-01
# License: GNU General Public License

use warnings;
use strict;

my $missinglimit = 20;

my %wordfreq;
my %seen;

load_wordfreq($ARGV[0]);
update_freq($ARGV[1]);
report_missing();
exit 0;

sub load_wordfreq {
    my $filename = shift;
    open(F, "<$filename") or die "Unable to read '$filename'";
    while (<F>) {
	chomp;
	my ($count, $word) = split;
	next if ($word =~ m/^\d+$/); # Ignore numbers
	my $f;
	if ($count<=5) {
	    $f=$count;
	} else {
	    $f = -9 + 15 * log(1+log($count));
	}
	$wordfreq{$word} = sprintf("%.0f", $f);
    }
    close F;
}

sub update_freq {
    my $filename = shift;
    open (F, "<$filename") or die "Unable to read '$filename'";
    while (<F>) {
	chomp;
	if (/^\#/) {
	    print "$_\n";
	    next;
	}
	my ($word, $rest) = ("", "0");
	if (m/^(\S+) (.*)$/) {
	    ($word, $rest) = ($1,$2);
	    my $f = "0";
	    my $c;
	    $c = $f = $1 if ($rest =~ m/(\d+) ?/);
	    my $shortword = lc($word);
	    $shortword =~ s/-//g;
	    $seen{$shortword} = 1;
	    if (exists $wordfreq{lc($word)}) {
		$c = $wordfreq{lc($word)};
	    } elsif (exists $wordfreq{$shortword}) {
		$c = $wordfreq{$shortword};
	    }
	    $rest =~ s/$f/$c/ if (0 == $f);
	}
	print "$word $rest\n";
	$seen{lc($word)} = 1;
    }
    close F;
}

sub report_missing {
    for my $word (keys %wordfreq) {
	if ($wordfreq{$word} > $missinglimit && ! exists $seen{lc($word)}) {
	    print STDERR "Missing word '$word' have high frequency.\n";
	}
    }
}