File: thes_to_dat

package info (click to toggle)

norwegian 2.2-4

links: PTS
area: main
in suites: bookworm, bullseye, buster, forky, sid, trixie
size: 26,448 kB
sloc: perl: 2,695; makefile: 1,678; sh: 209

file content (62 lines) | stat: -rw-r--r-- 1,633 bytes

parent folder | download | duplicates (2)

#!/usr/bin/perl -w

# Copyright (C) 2005 Enrico Zini <enrico@debian.org>
#
# This code is released under the public domain.
#
# Converts the thesaurus.txt from the Italian thesaurus project into
# the .dat file for OpenOffice.org 2
#
# The description of the output (.dat) format can be found at:
#
# http://rpms.alerque.com/BUILD/ooo-build-1.9.104/build/src680-m104/lingucomponent/source/thesaurus/mythes/data_layout.txt

# Modified 2007-05-15 by Petter Reinholdtsen to use ; instead of , as
# separator, for the spell-norwegian package.
# Modified 2008-03-09 by Petter Reinholdtsen to list all words in a
# word group individually, with the other words as synonyms.

use strict;
use warnings;
# Not using locale specific sorting, to avoid build issues on machines
# without the requested locale.
#use locale;

# Split one raw input line into the words of a synonym group.
#
# Takes the line as read (line terminator may still be attached) and
# returns the list of words in their original order.  Fields are
# separated by ';'.  Whitespace around every field is dropped -- also at
# the very start and end of the line -- empty fields are skipped, and a
# word repeated within the line is kept only once, so it cannot produce
# duplicated meaning lines or appear twice in a synonym list.
sub split_group {
    my ($line) = @_;

    # Fix newlines in whatever encoding
    $line =~ s/[\r\n]+$//;

    my @words;
    my %seen;
    for my $field (split(/;/, $line)) {
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;
        next if $field eq '';
        next if $seen{$field}++;
        push(@words, $field);
    }
    return @words;
}

# Skip the first two lines (presumably a header in the input -- the
# format of those lines is not checked here)
<>;
<>;

# Maps each word to a reference to a list of synonym groups.  Every
# group is itself an array reference holding the other words of one
# input line the word appeared on.
my %entries;

# Convert the rest
while (my $line = <>) {
    my @words = split_group($line);

    # A group needs at least two distinct words to yield a synonym.
    next if @words < 2;

    for my $word (@words) {
        my @syms = grep { $_ ne $word } @words;
        # push autovivifies the list for a word seen for the first time.
        push(@{$entries{$word}}, \@syms);
    }
}

# Print the charset
print "ISO8859-1\n";

# One "word|count" line per word, followed by count meaning lines of
# the form "-|syn1|syn2|...".  The first field of a meaning line is
# always "-" (presumably the part-of-speech slot of the .dat layout).
# Words are emitted in plain string order, not locale order.
for my $word (sort keys %entries) {
    my @groups = @{$entries{$word}};
    printf("%s|%d\n", $word, scalar @groups);
    for my $group (@groups) {
        print "-|", join('|', @{$group}), "\n";
    }
}

# vim:set ts=4 sw=4: