File: lexbuild

package info (click to toggle)
gramadoir 0.7-6
  • links: PTS
  • area: main
  • in suites: forky, sid
  • size: 12,628 kB
  • sloc: perl: 11,207; sh: 2,973; xml: 462; lisp: 196; makefile: 94; yacc: 63; lex: 62; ansic: 26; sed: 16
file content (38 lines) | stat: -rw-r--r-- 1,090 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/perl
# Copyright (C) 2004 Kevin P. Scannell
# This is free software; see the file COPYING for copying conditions.  There is
# NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
#   Reads a plain text lexicon from standard input;
#   words with frequency 0 and 1 have been taken out and dealt with already.
#   Processes these words and writes 5 files of approximately the same size:
#   lexicon0.txt through lexicon4.txt ranging from highest to lowest freq.

use strict;
use warnings;

# Returns the grouping key of a lexicon line: everything up to and including
# the first space.  A line that contains no space is returned unchanged.
sub head_key {
	my ($line) = @_;
	my $space_at = index($line, ' ');
	return $space_at < 0 ? $line : substr($line, 0, $space_at + 1);
}

# Decides where each output file starts.
#
#   $lines      array reference holding the input lines, in input order
#   $num_files  maximum number of output files (positive integer)
#
# Returns the list of 0-based indices into @$lines at which a new output file
# begins; the first element is always 0, so even empty input yields one file.
#
# A new file is started once more than k/$num_files of the lines have been
# consumed (k = number of files opened so far), but never between two
# consecutive lines that share the same grouping key (see head_key).
# NOTE(review): this keeps all entries of a headword in one file only if the
# input has equal headwords on adjacent lines -- confirm against the producer.
sub lexicon_split_points {
	my ($lines, $num_files) = @_;
	my $total = scalar @{$lines};
	my @starts = (0);
	my $prev;
	for my $i (0 .. $total - 1) {
		# Never open more files than requested.
		last if @starts == $num_files;
		my $line = $lines->[$i];
		# Computed as k*total/n rather than by repeated addition of total/n,
		# so the final threshold is exactly $total and rounding drift cannot
		# trigger an extra file.
		my $limit = scalar(@starts) * $total / $num_files;
		# The key is compared literally (not as a regex), so headwords that
		# contain characters such as '.', '+' or '(' are handled correctly.
		if (defined $prev and $i + 1 > $limit and index($line, head_key($prev)) != 0) {
			push @starts, $i;
		}
		$prev = $line;
	}
	return @starts;
}

# Reads the whole lexicon from standard input as raw bytes and writes it out
# as lexicon0.txt, lexicon1.txt, ... in the current directory, one file per
# split point.  Dies if a file cannot be opened, written or closed.
# Returns 0, which is used as the process exit status.
sub main {
	binmode STDIN, ":bytes";
	my @everything = <STDIN>;
	my @starts = lexicon_split_points(\@everything, 5);
	for my $filenum (0 .. $#starts) {
		my $first = $starts[$filenum];
		my $last = $filenum < $#starts ? $starts[$filenum + 1] - 1 : $#everything;
		open(my $out, ">:bytes", "lexicon$filenum.txt") or die "Could not open output file: $!\n";
		print {$out} @everything[$first .. $last] or die "Could not write output file: $!\n";
		# Buffered write errors only surface at close, so check it.
		close($out) or die "Could not close output file: $!\n";
	}
	return 0;
}

# Run only when executed as a script, so the helpers can be loaded on their own.
exit(main()) unless caller;
1;