File: freq-filter

package info (click to toggle)

norwegian 2.0.10-2

links: PTS
area: main
in suites: lenny
size: 26,296 kB
ctags: 178
sloc: perl: 2,665; makefile: 1,724; sh: 206

file content (94 lines) | stat: -rwxr-xr-x 2,228 bytes

parent folder | download | duplicates (7)

#!/usr/bin/perl
#
# Author:  Petter Reinholdtsen
# Date:    2006-02-01
# License: GNU General Public License
#
# Reads a munchfile and a list of words expanded from the same
# munchfile, and only accepts the munched entries that have at least one
# word with a frequency value over the given threshold.
#
# Example of use:
#   munchfile < words > munchfile
#      contain foo/AB\n
#   ./expndflg.pl munchfile | sort  '-t/' -u +0f -1 +0 > munchlist
#      contain foo/\nfoo/A\nfoo/B\n
#   ispell -e < munchlist > munchwordfile
#   ./freq-filter munchfile munchwordfile freqfile
#   [join all flags]
#
# The freqfile format is one line per word, with each line consisting
# of '<freq> <word>' separated by one space.

use strict;
use warnings;

use Getopt::Std;

# Command line handling: -d turns on debug output on STDERR and
# -l <threshold> sets the frequency threshold (default 0).  Exactly three
# file arguments must follow the options.
my %opts;
getopts('dl:', \%opts) || usage();
usage() unless (3 == @ARGV);

my $threshold = $opts{'l'} || 0;
my $debug = $opts{'d'} || 0;

# %wordfreq:  word -> frequency value, filled in by load_wordfreq().
# %munchfreq: munch file entry -> highest frequency value found among the
#             words on the matching word file line (-1 if none is known).
my %wordfreq;
my %munchfreq;

my $munchfile = $ARGV[0];
my $wordfile  = $ARGV[1];
my $freqfile  = $ARGV[2];

load_wordfreq($freqfile);

# Three-argument open with lexical handles, so a file name can never be
# interpreted as an open mode, and $! tells the user why an open failed.
open(my $munch_fh, '<', $munchfile)
    or die "Unable to read from $munchfile: $!";
open(my $words_fh, '<', $wordfile)
    or die "Unable to read from $wordfile: $!";

# The two files are read in lock step: line N of the word file holds the
# words belonging to line N of the munch file.
my $warned_short = 0;
while (my $munch = <$munch_fh>) {
    chomp $munch;

    my $wordline = <$words_fh>;
    if (!defined $wordline) {
        # The word file ran out before the munch file did.  Treat the
        # entry as having no words, and say so once instead of feeding
        # undef to split for every remaining line.
        warn "$wordfile has fewer lines than $munchfile\n"
            unless $warned_short++;
        $wordline = '';
    }

    # A ' ' pattern splits on whitespace like /\s+/ but does not produce
    # an empty leading field when the line starts with whitespace.
    my @words = split(' ', $wordline);

    # Keep the highest frequency value of any known word; -1 marks an
    # entry where no word is present in the frequency list.
    my $freq = -1;
    for my $word (@words) {
        next unless exists $wordfreq{$word};
        $freq = $wordfreq{$word} if $freq < $wordfreq{$word};
    }
    $munchfreq{$munch} = $freq;

    print STDERR "$munch @words\n" if $debug;
}
close $words_fh;
close $munch_fh;

# Emit, in sorted order, every munch entry whose best frequency value is
# strictly above the threshold.
for my $munch (sort keys %munchfreq) {
    print "$munch\n" if ($munchfreq{$munch} > $threshold);
}

# Print a usage summary and terminate the program with exit status 1.
#
# The text goes to STDERR: this sub is called when the command line is
# invalid, and STDOUT is where the filtered munch entries are written, so
# an error message must not end up in redirected output.
sub usage {
    print STDERR <<EOF;
Usage: $0 [-d] [-l <threshold>] munched munchword freqlist

  -d             Enable debug output
  -l <threshold> Change threshold value from 0 to another value.
                 Valid values are from 0 and up.
EOF
    exit 1;
}

# Load the word frequency list in $filename into the file-level %wordfreq
# hash (word -> frequency value).
#
# Each line is expected to look like '<count> <word>'; only the first two
# whitespace-separated fields are used.  Lines without a word field (blank
# or truncated lines) and words made up solely of digits are skipped.
#
# The stored value is the count itself for counts up to 5, and
# -9 + 15 * log(1 + log(count)) for larger counts, in both cases rounded
# to a whole number with sprintf("%.0f").  When a word occurs on several
# lines, the last one wins.
#
# Dies if the file cannot be opened.
sub load_wordfreq {
    my $filename = shift;

    # Three-argument open with a lexical handle: the file name is never
    # parsed for mode characters and the handle cannot clash with another
    # bareword handle elsewhere in the program.
    open(my $freq_fh, '<', $filename)
        or die "Unable to read '$filename': $!";

    while (my $line = <$freq_fh>) {
        chomp $line;
        my ($count, $word) = split(' ', $line);

        # Blank or one-field lines have no word; without this guard they
        # would trigger warnings and store an entry under the empty key.
        next unless defined $word;
        next if ($word =~ m/^\d+$/); # Ignore numbers

        my $f;
        if ($count <= 5) {
            $f = $count;
        } else {
            # Compress large counts with a double logarithm.
            $f = -9 + 15 * log(1 + log($count));
        }
        $wordfreq{$word} = sprintf("%.0f", $f);
    }
    close $freq_fh;
}