#!/usr/bin/perl -w
# Copyright (C) 2005 Enrico Zini <enrico@debian.org>
#
# This code is released under the public domain.
#
# Converts the thesaurus.txt from the Italian thesaurus project into
# the .dat file for OpenOffice.org 2
#
# The description of the output (.dat) format can be found at:
#
# http://rpms.alerque.com/BUILD/ooo-build-1.9.104/build/src680-m104/lingucomponent/source/thesaurus/mythes/data_layout.txt
# Modified 2007-05-15 by Petter Reinholdtsen to use ; instead of , as
# separator, for the spell-norwegian package.
# Modified 2008-03-09 by Petter Reinholdtsen to list all words in a
# word group individually, with the other words as synonyms.
use strict;
use warnings;
# Not using locale specific sorting, to avoid build issues on machines
# without the requested locale.
#use locale;
# Split one input line into its synonym group.
#
# Takes the raw line (the line terminator is optional) and returns the
# list of words found between the ';' separators, in input order, with
# surrounding whitespace removed, empty fields dropped and repeated words
# collapsed to their first occurrence.
sub parse_group {
    my ($line) = @_;

    # Fix newlines in whatever encoding
    $line =~ s/[\r\n]+$//;

    my %seen;
    my @words;
    for my $field (split(/;/, $line)) {
        # Trim each field here rather than in the split pattern, so that
        # whitespace before the first word and after the last word is
        # removed as well.
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;

        # An empty field (";;" or a leading ";") is not a word, and a
        # repeated word would produce duplicate synonyms and duplicate
        # meaning groups.
        next if $field eq '' || $seen{$field}++;
        push(@words, $field);
    }
    return @words;
}

# Record one synonym group in the entries table.
#
# $entries is a reference to a hash mapping each headword to a list of
# meaning groups (array references).  Every word in @words gets one new
# group holding all the other words of the same line, in input order.
sub add_group {
    my ($entries, @words) = @_;

    for my $word (@words) {
        my @syms = grep { $_ ne $word } @words;

        # push autovivifies the list the first time a headword is seen.
        push(@{$entries->{$word}}, \@syms);
    }
    return;
}

# Write the charset line and all entries to the filehandle $out.
#
# Headwords are written in plain (non-locale) string order.  Each one gets
# a "word|count" line, followed by one "-|syn|syn..." line per meaning
# group recorded for it.
sub print_entries {
    my ($out, $entries) = @_;

    # Print the charset
    print {$out} "ISO8859-1\n";

    for my $word (sort keys %{$entries}) {
        my @groups = @{$entries->{$word}};
        printf {$out} "%s|%d\n", $word, scalar @groups;
        for my $group (@groups) {
            print {$out} "-|", join('|', @{$group}), "\n";
        }
    }
    return;
}

# Skip the first two lines
<>;
<>;

# Convert the rest.  A line with fewer than two words has no synonyms to
# list and is skipped.
my %entries;
while (my $line = <>) {
    my @words = parse_group($line);
    next if @words < 2;
    add_group(\%entries, @words);
}

print_entries(\*STDOUT, \%entries);
# vim:set ts=4 sw=4: