1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
|
#!/usr/bin/perl
# Copyright (C) 2004 Kevin P. Scannell
# This is free software; see the file COPYING for copying conditions. There is
# NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Reads a plain text lexicon from standard input;
# words with frequency 0 and 1 have been taken out and dealt with already.
# Processes these words and writes 5 files of approximately the same size:
# lexicon0.txt through lexicon4.txt ranging from highest to lowest freq.
use strict;
use warnings;

# Number of output files the lexicon is divided into (lexicon0.txt, ...).
my $NUM_FILES = 5;

# Slurp the whole lexicon first: the total line count is needed to work out
# how many lines each output file should receive.
binmode STDIN, ":bytes";
my @everything = <STDIN>;

# Target number of lines per output file; may be fractional.
my $chunk = @everything / $NUM_FILES;

my $linenum = 0;         # number of input lines consumed so far
my $stop    = $chunk;    # line count beyond which the current file is "full"
my $filenum = 0;         # index of the output file currently being written
my $prev;                # last line written; undef until one has been written

# Opens lexicon<N>.txt for raw byte output and returns the filehandle.
# Dies with the system error message if the file cannot be created.
sub open_lexicon {
    my ($n) = @_;
    open(my $fh, ">:bytes", "lexicon$n.txt")
        or die "Could not open output file: $!\n";
    return $fh;
}

my $out = open_lexicon($filenum);
foreach my $line (@everything) {
    $linenum++;

    # Once the current file has reached its quota, move on to the next file,
    # but only at a point where the leading field changes, so that
    # consecutive lines sharing the same first field stay in one file.
    if ($linenum > $stop && defined $prev) {
        # Key of the previous line: everything up to and including its first
        # space, or the whole line when it contains no space.
        (my $key = $prev) =~ s/ .*/ /s;

        # Compare as a literal prefix rather than interpolating the key into
        # a regex: entries may contain regex metacharacters ('.', '+', '(',
        # ...) which would otherwise mismatch or abort with a compile error.
        if (substr($line, 0, length $key) ne $key) {
            close($out) or die "Could not close output file: $!\n";
            $filenum++;
            $out = open_lexicon($filenum);
            $stop += $chunk;
        }
    }

    print {$out} $line or die "Could not write to output file: $!\n";
    $prev = $line;
}

# Buffered write errors only surface at close, so check the final file too.
close($out) or die "Could not close output file: $!\n";
exit 0;
|