File: freq-filter

package info (click to toggle)
norwegian 2.0.10-2
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 26,296 kB
  • ctags: 178
  • sloc: perl: 2,665; makefile: 1,724; sh: 206
file content (94 lines) | stat: -rwxr-xr-x 2,228 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/perl
#
# Author:  Petter Reinholdtsen
# Date:    2006-02-01
# License: GNU General Public License
#
# Reads a munchfile and a list of words expanded from the same
# munchfile, and only accepts the munched entries that have at least
# one word with a frequency value over the given threshold.
#
# Example of use:
#   munchfile < words > munchfile
#      contain foo/AB\n
#   ./expndflg.pl munchfile | sort  '-t/' -u +0f -1 +0 > munchlist
#      contain foo/\nfoo/A\nfoo/B\n
#   ispell -e < munchlist > munchwordfile
#   ./freq-filter munchfile munchwordfile freqfile
#   [join all flags]
#
# The freqfile format is one line per word, with each line consisting
# of '<freq> <word>' separated by one space.

use strict;
use warnings;

use Getopt::Std;

# Main program: parse the options, load the frequency list, then walk the
# munch file and the expanded word file in lock step (line N of the word
# file holds the words expanded from line N of the munch file) and print
# every munched entry whose best word frequency exceeds the threshold.
my %opts;
getopts('dl:', \%opts) || usage();
usage() unless (3 == @ARGV);

# An entry is printed only when its best frequency value is strictly
# greater than this threshold (-l, defaults to 0).
my $threshold = $opts{'l'} || 0;
my $debug = $opts{'d'} || 0;

# Frequency value per word, filled in by load_wordfreq().
my %wordfreq;
# Highest frequency value found among the words of each munched entry,
# or -1 when none of its words is present in %wordfreq.
my %munchfreq;

my $munchfile = $ARGV[0];
my $wordfile  = $ARGV[1];
my $freqfile  = $ARGV[2];

load_wordfreq($freqfile);

# Three-argument open with lexical handles, so that a file name containing
# mode characters or surrounding whitespace is taken literally, and the
# system error is reported when the open fails.
open(my $munch_fh, '<', $munchfile)
    or die "Unable to read from $munchfile: $!";
open(my $words_fh, '<', $wordfile)
    or die "Unable to read from $wordfile: $!";

my $warned_short = 0;
while (my $munch = <$munch_fh>) {
    chomp $munch;

    # The word file may run out before the munch file does.  Treat the
    # missing lines as empty (the entry then keeps frequency -1) and say so
    # once, instead of feeding undef to split for every remaining entry.
    my $line = <$words_fh>;
    unless (defined $line) {
        warn "$wordfile has fewer lines than $munchfile\n"
            unless $warned_short++;
        $line = '';
    }

    # split ' ' ignores leading whitespace, so an indented line does not
    # produce an empty first "word".
    my @words = split(' ', $line);

    my $freq = -1;
    for my $word (@words) {
        $freq = $wordfreq{$word} if (exists $wordfreq{$word} &&
                                     $freq < $wordfreq{$word});
    }
    $munchfreq{$munch} = $freq;

    print STDERR "$munch @words\n" if $debug;
}
close $words_fh;
close $munch_fh;

# Output is sorted by munched entry; identical entries collapse into one
# hash key and are therefore printed at most once.
for my $munch (sort keys %munchfreq) {
    print "$munch\n" if ($munchfreq{$munch} > $threshold);
}

# Print the command line synopsis and terminate the program with exit
# status 1.  Never returns to the caller.
sub usage {
    my $help = <<"EOF";
Usage: $0 [-d] [-l <threshold>] munched munchword freqlist

  -d             Enable debug output
  -l <threshold> Change threshold value from 0 to another value.
                 Valid values are from 0 and up.
EOF
    print $help;
    exit 1;
}

# Load the word frequency list in $filename into the file-level %wordfreq
# hash.
#
# Each line is expected to hold '<count> <word>'.  Words consisting only of
# digits are ignored.  Counts up to 5 are stored unchanged; larger counts
# are compressed to -9 + 15 * log(1 + log(count)).  The result is rounded
# to a whole number with sprintf("%.0f") before it is stored.
#
# Parameters:
#   $filename - path of the frequency file to read.
#
# Dies when the file cannot be opened.
sub load_wordfreq {
    my $filename = shift;

    # Three-argument open with a lexical handle: the file name is taken
    # literally and the handle cannot clash with another bareword handle.
    open(my $fh, '<', $filename)
        or die "Unable to read '$filename': $!";

    # A named loop variable keeps the caller's $_ untouched.
    while (my $line = <$fh>) {
        chomp $line;
        my ($count, $word) = split(' ', $line);

        # Blank lines and lines with a single field carry no usable
        # count/word pair; skipping them avoids "uninitialized value"
        # warnings and a bogus empty-string key in %wordfreq.
        next unless defined $word;
        next if ($word =~ m/^\d+$/); # Ignore numbers

        my $f;
        if ($count <= 5) {
            $f = $count;
        } else {
            $f = -9 + 15 * log(1 + log($count));
        }
        $wordfreq{$word} = sprintf("%.0f", $f);
    }
    close $fh;
}