File: speling-extract-synonyms

package info (click to toggle)
norwegian 2.0.10-2
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 26,296 kB
  • ctags: 178
  • sloc: perl: 2,665; makefile: 1,724; sh: 206
file content (86 lines) | stat: -rwxr-xr-x 2,300 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/perl
#
# Author:  Petter Reinholdtsen
# Date:    2005-12-02
# License: GNU General Public License
#
# speling-extract-synonyms extract synonyms usable for OOo from the
# raw data files produced by www.speling.org software.
#
# Usage:
#   echo abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ- > thesaurus-ll.txt
#   speling-extract-synonyms < source >> thesaurus-ll.txt
#   thescoder thesaurus-ll.txt ll_CC

use warnings;
use strict;

# The difference between approvals and rejections need to be higher
# than this value.
my $acceptlimit = 0;

my $debug = 0;

my %synonym;
my %words;

my $word;
while (<>) {
    chomp;
    $word = $1 if /^WORD: (.+)$/;

    if (/^STATUS: \+$/) {
	$words{$word}++;
    } elsif (/^STATUS: -$/) {
	$words{$word} -= 2;
    } elsif (/^STATUS: \?$/) {
	$words{$word} -= 0.5;
    }
    if (defined $word && /^SYNONYMS?: (.+)$/) {
        my @synonyms = split(/\s*,\s*/, $1);
        if (exists $synonym{$word}) {
            print "Adding @synonyms for '$word'\n" if $debug;
            push(@{$synonym{$word}}, @synonyms);
        } else {
            print "Inserting @synonyms for '$word'\n" if $debug;
            @{$synonym{$word}} = @synonyms;
        }
    }
}

# Only accepts words seen in the word database, and with more approves
# than rejects.
sub uncontroversial_word {
    my ($word, $base) = @_;
    if (exists $words{$word} && $acceptlimit < $words{$word}) {
	return 1;
    } else {
	my $wordval = $words{$word} || "[not known]";
	print STDERR "Ignoring controversial word '$word' ($wordval) from '$base'\n";
	return 0;
    }
}
 
for my $keyword (sort keys %synonym) {

    # Ignore controversial words
    next unless uncontroversial_word($keyword, $keyword);

    my $lastsynonym = "";
    my $symlist = "";
  SYNONYM:
    for my $synonym (sort @{$synonym{$keyword}}) {
	# Do not accept the word ifself as a synonym
	next if ($keyword eq $synonym);

	# Make sure all the words in the synonym phrase is
	# uncontroversial
	for my $word (split(/\s+/, $synonym)) {
	    # Only accept words with more accepts than rejects
	    next SYNONYM unless uncontroversial_word($word, $keyword);
	}
        $symlist .= "; $synonym" if ($synonym ne $lastsynonym);
        $lastsynonym = $synonym;
    }
    print "$keyword$symlist\n" if $symlist;
}