File: speling-extract-synonyms

package info (click to toggle)

norwegian 2.0.10-2

links: PTS
area: main
in suites: lenny
size: 26,296 kB
ctags: 178
sloc: perl: 2,665; makefile: 1,724; sh: 206

file content (86 lines) | stat: -rwxr-xr-x 2,300 bytes

parent folder | download | duplicates (7)

#!/usr/bin/perl
#
# Author:  Petter Reinholdtsen
# Date:    2005-12-02
# License: GNU General Public License
#
# speling-extract-synonyms extract synonyms usable for OOo from the
# raw data files produced by www.speling.org software.
#
# Usage:
#   echo abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ- > thesaurus-ll.txt
#   speling-extract-synonyms < source >> thesaurus-ll.txt
#   thescoder thesaurus-ll.txt ll_CC

use warnings;
use strict;

# The difference between approvals and rejections need to be higher
# than this value.
my $acceptlimit = 0;

my $debug = 0;

my %synonym;
my %words;

my $word;
while (<>) {
    chomp;
    $word = $1 if /^WORD: (.+)$/;

    if (/^STATUS: \+$/) {
	$words{$word}++;
    } elsif (/^STATUS: -$/) {
	$words{$word} -= 2;
    } elsif (/^STATUS: \?$/) {
	$words{$word} -= 0.5;
    }
    if (defined $word && /^SYNONYMS?: (.+)$/) {
        my @synonyms = split(/\s*,\s*/, $1);
        if (exists $synonym{$word}) {
            print "Adding @synonyms for '$word'\n" if $debug;
            push(@{$synonym{$word}}, @synonyms);
        } else {
            print "Inserting @synonyms for '$word'\n" if $debug;
            @{$synonym{$word}} = @synonyms;
        }
    }
}

# Only accepts words seen in the word database, and with more approves
# than rejects.
sub uncontroversial_word {
    my ($word, $base) = @_;
    if (exists $words{$word} && $acceptlimit < $words{$word}) {
	return 1;
    } else {
	my $wordval = $words{$word} || "[not known]";
	print STDERR "Ignoring controversial word '$word' ($wordval) from '$base'\n";
	return 0;
    }
}
 
for my $keyword (sort keys %synonym) {

    # Ignore controversial words
    next unless uncontroversial_word($keyword, $keyword);

    my $lastsynonym = "";
    my $symlist = "";
  SYNONYM:
    for my $synonym (sort @{$synonym{$keyword}}) {
	# Do not accept the word ifself as a synonym
	next if ($keyword eq $synonym);

	# Make sure all the words in the synonym phrase is
	# uncontroversial
	for my $word (split(/\s+/, $synonym)) {
	    # Only accept words with more accepts than rejects
	    next SYNONYM unless uncontroversial_word($word, $keyword);
	}
        $symlist .= "; $synonym" if ($synonym ne $lastsynonym);
        $lastsynonym = $synonym;
    }
    print "$keyword$symlist\n" if $symlist;
}