File: cedictmerge

package info (click to toggle)
cedicttools 1.1-3
  • links: PTS
  • area: contrib
  • in suites: potato, woody
  • size: 180 kB
  • ctags: 52
  • sloc: perl: 1,234; lisp: 109; makefile: 43; sh: 28
file content (344 lines) | stat: -rwxr-xr-x 11,030 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
#!/usr/bin/perl -w
#
# Copyright (c) 1998  David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 85.
#
# File: cedictmerge, version 1.1
#   By: David Hiebeler
#       Center for Applied Math
#       Cornell University
#       Ithaca, NY 14853
#       hiebeler@cam.cornell.edu
#       http://www.cam.cornell.edu/hiebeler/home.html
#
#       Version 1.1: December 1998
#       Version 1.0: July 1998
#
#
# This is a perl script for merging two CEDICT-format files (see
# "http://www.mindspring.com/~paul_denisowski/cedict.html" for
# information about CEDICT).
#
# Usage: cedictmerge [-o outFile] [-nodup] file1 file2 [file3 file4 ... fileN]
#
# Consider file1 to be the "original" or "main" file, and the other
# files will be merged into it.  Consider file2 being merged in.
# If an entry in file2 is not in file1, then it will be appended to
# the end of file1's data (however, file1 will not be changed -- the
# merged data will be written to outFile if specified, otherwise stdout).
# If an entry in file2 is in file1, then it will still be appended to
# the end of the merged file, but with the special field "/!!!!!/" appended
# to its English definition (so that one can easily find such entries
# using an editor later, and manually merge duplicate entries).
#
# If the "-nodup" flag is specified, duplicate entries from file2 will not
# be appended to the output data.
#
# You can use the "-nma x" argument to set the "NeutralMatchesAny" flag.
# The value "x" should be either 0 or 1.  If you use 1, it means a neutral
# tone (i.e. tone 5) matches any tone.  This is because it's a pretty
# common mistake (at least for me) to put the "intrinsic tone" in the pinyin
# field for a character, if I don't realize the character's tone becomes
# neutral in that particular word.  This helps catch such mistakes.
#
# You can use the "-uu2u:" command-line argument to turn pinyin entries
# like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite,
# i.e. turn "nu:3" into "nuu3".  (This feature is available because
# both forms have appeared in various versions of CEDICT).
#
# This script should work correctly on both GB and BIG5 files.
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception: it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line).  You may want to use the "cedictcheckformat" script
# first to catch any lines in your vocabulary file which are not in strict
# CEDICT format.  You may also want to use "cedictsort" after merging,
# to sort the results.
#
# Also note that this script checks each new word against the list of
# all words it has seen so far.  In particular, this means it will catch
# duplicate entries within any file after the first, not only duplicates
# between different files.  (The first file is read without any duplicate
# checking, so duplicates within it are not flagged.)
#
# Wishlist:
# o) Allow the option of ignoring the pinyin field, since then it could
#    catch entries which have mistakes in the pinyin (other than just
#    mistakes about characters changing to neutral tone).  It would probably
#    be best to do this only for multi-character words, so that it wouldn't
#    flag all of the single characters which have multiple pronunciations.
#
# History:
#   10 Dec 1998: added code to turn "uu" into "u:" or vice-versa in the
#      pinyin field if the user requests it, to handle the fact that both
#      forms have been present in cedict for some time now.
#   29 July 1998: original version, 1.0


# Values that $uConvert can take to request a pinyin rewrite:
# $uu2uc asks for "uu" -> "u:", $uc2uu asks for "u:" -> "uu".
# (A $uConvert of 0 requests no rewrite.)
($uu2uc, $uc2uu) = (1, 2);


#
# Print the program name/version banner, followed by the copyright and
# GNU General Public License notice, to the currently selected output
# handle (stdout unless changed).
#
sub printLicense {
    print <<"END_OF_LICENSE";
cedictmerge version 1.1   June 10, 1999
Copyright (C) 1998,1999  David Hiebeler
                         Center for Applied Math
                         Cornell University
                         Ithaca, NY 14853
                         hiebeler\@cam.cornell.edu
                         http://www.cam.cornell.edu/hiebeler/home.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

END_OF_LICENSE
}


#
# Reset the command-line-controlled settings to their defaults:
# output goes to "-", duplicates are kept, neutral tone does not match
# other tones, and no "uu"/"u:" pinyin rewrite is done.
#
sub setupdefaults {
    ($outFname, $doDup) = ("-", 1);
    $neutralMatchesAny = $uConvert = 0;
}


#
# Write the command-line synopsis and the description of each option
# to stdout, then terminate the program with exit status 2.
#
sub printusage {
    print <<"END_OF_USAGE";
Usage: $0 [-nma 0|1] [-o outFname] [-nodup] [-uu2u: | -u:2uu] file1 file2 [file3 ... fileN]
   -nma 0|1 : Neutral Matches Any (1 for on, 0 for off) -- for comparing two
              entries to see if they are the same; on means neutral tone
              matches any tone
   -o outFname : specify where to put the output (default = stdout)
   -nodup : Don't output duplicate entries
            (default = output, with extra /!!!!!/ field at end)
   -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)
   -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)
END_OF_USAGE
    exit 2;
}


#
# Read the next meaningful line of input and return it without its
# line terminator.
#
# With no argument the line is read from <> (the files named in @ARGV,
# or stdin).  With one argument, that argument is the filehandle to read
# from: either a handle name such as "INFILE" or a handle reference.
# Dies if called with more than one argument.
#
# Lines which are blank, or which hold nothing but a "#" comment, are
# skipped.  A trailing "#" comment is cut off the line that is returned
# (whitespace in front of that comment is kept).  Returns nothing (undef
# in scalar context) at end of input.
#
# Every raw line read, skipped or not, increments $getlinelinenum.
#
$getlinelinenum = 0;
sub getline {
    my ($fh, $text);
    my $useArgv = (scalar(@_) == 0);

    if (scalar(@_) > 1) {
	die "getline must be called with at most one argument";
    }
    $fh = $_[0] unless $useArgv;

    while (1) {
	$text = $useArgv ? <> : <$fh>;
	last unless defined($text);
	$getlinelinenum++;
	next if $text =~ /^\s*#/;
	next if $text =~ /^\s*$/;
	$text =~ s/#.*$//;
	# Remove the line terminator only (LF or CRLF), never a blind
	# last character: a final line with no trailing newline must keep
	# the closing "/" of its definition.
	$text =~ s/\r?\n\z//;
	return $text;
    }
    return;
}


#
# Return 1 if the two strings have the same pinyin, otherwise return 0.
# The two strings are in $_[0] and $_[1]; each is a whitespace-separated
# list of syllables.
#
# The strings match when they have the same number of syllables and every
# pair of syllables matches.  Two syllables match when they are identical.
# If $neutralMatchesAny is true they also match when at least one of them
# ends in tone 5 and both are the same text followed by one tone digit.
#
# Syllables are compared as plain text: nothing from the data is ever
# used as a regular expression, so characters such as "." or "(" in a
# pinyin field cannot act as wildcards or break the comparison.
#
sub samePinyin {
    my @words1 = split(" ", $_[0]);
    my @words2 = split(" ", $_[1]);
    my ($i, $stem1, $stem2);

    return 0 if scalar(@words1) != scalar(@words2);

    for ($i = 0; $i < scalar(@words1); $i++) {
	next if $words1[$i] eq $words2[$i];

	# The syllables differ; only a neutral tone can still save them.
	return 0 unless $neutralMatchesAny;
	return 0 unless $words1[$i] =~ /5$/ || $words2[$i] =~ /5$/;

	# Both must be "<stem><tone digit>" with identical stems.
	return 0 unless $words1[$i] =~ /^(.*)\d$/;
	$stem1 = $1;
	return 0 unless $words2[$i] =~ /^(.*)\d$/;
	$stem2 = $1;
	return 0 if $stem1 ne $stem2;
    }
    # if we got this far, it must have been a match
    return 1;
}


#
# Read one CEDICT-format vocabulary file and append its entries to a list.
#
#   $_[0] : name of the file to read ("-" means stdin)
#   $_[1] : reference to the array of hash references to append to; each
#           entry is a hash with the keys "levels", "chinese", "pinyin"
#           and "english"
#   $_[2] : 1 to check every new entry for duplicates against everything
#           already in the list (including entries added earlier in this
#           same call); 0 to just read, without checking
#
# Two entries are duplicates when their chinese fields are equal and
# samePinyin() accepts their pinyin fields.  A duplicate is dropped when
# $doDup is 0; otherwise it is kept, with "/!!!!!/" appended to its
# english field.
#
# Uses and advances the global $vocabIndex (index of the next free slot
# in the list), and honours $uConvert for the "uu" <-> "u:" pinyin
# rewrite.  Dies if the file cannot be opened or if a line is not in
# CEDICT format.
#
sub readvocabfile {
    my ($fname, $arrayRef) = @_;
    my $checkDups = (defined($_[2]) && $_[2] == 1);
    my ($in, $line, $levels, $chinese, $pinyin, $english, $i, $knownPinyin);
    my %pinyinsFor;   # chinese => [ pinyin of each entry already stored ]

    if ($fname eq "-") {
	$in = \*STDIN;
    }
    else {
	# Three-argument open: the name is taken literally, so characters
	# such as "|", "<" or ">" in it are never treated as an open mode.
	open($in, "<", $fname) or die "Couldn't open infile '$fname': $!";
    }

    # Index what is already stored, so that each new entry is compared
    # only with entries for the same chinese word rather than with the
    # whole list.
    if ($checkDups) {
	for ($i = 0; $i < $vocabIndex; $i++) {
	    push @{ $pinyinsFor{$$arrayRef[$i]->{chinese}} },
		$$arrayRef[$i]->{pinyin};
	}
    }

  READVOCABLOOP:
    # Test with defined() so that a line consisting of "0" is reported as
    # invalid instead of silently ending the read.
    while (defined($line = getline($in))) {
	# handle case where line has skill level(s) at beginning
	if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
	    ($levels,$chinese,$pinyin,$english) = ($1,$2,$3,$4);
	    $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
	    $levels .= " ";
	}
	# line doesn't have skill level numbers at beginning
	elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
	    ($chinese,$pinyin,$english) = ($1,$2,$3);
	    $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
	    $levels = "";
	}
	else {
	    $line =~ s/[\n\r]//;
	    print "Invalid line: `$line'\n";
	    die "Invalid line encountered";
	}

	# Convert "uu" into "u:" or vice-versa in pinyin field,
	# if the user requested it.  The /g matters: a multi-syllable
	# word can contain the sequence more than once.
	if ($uConvert == $uu2uc) {
	    $pinyin =~ s/uu/u:/g;
	}
	elsif ($uConvert == $uc2uu) {
	    $pinyin =~ s/u:/uu/g;
	}

	if ($checkDups) {
	  DUPLOOP:
	    foreach $knownPinyin (@{ $pinyinsFor{$chinese} || [] }) {
		next DUPLOOP unless samePinyin($knownPinyin, $pinyin);
		# don't do duplicates, so don't add this entry to
		# the vocabulary list; go read the next entry.
		next READVOCABLOOP if $doDup == 0;
		$english =~ s@/$@/!!!!!/@;
		last DUPLOOP;  # we found duplicate, so exit inner loop
	    }
	    # this entry is being kept, so later entries must see it too
	    push @{ $pinyinsFor{$chinese} }, $pinyin;
	}

	# now put everything into the main array of hashes
	$$arrayRef[$vocabIndex] = {
	    "levels"  => $levels,
	    "chinese" => $chinese,
	    "english" => $english,
	    "pinyin"  => $pinyin,
	};
	$vocabIndex++;
    }
    close($in) unless $fname eq "-";
}


#
# Write every entry of the global @wordList to the filehandle given
# in $_[0], one entry per line, in the form
#     <levels><chinese> [<pinyin>] <english>
#
sub printVocab {
    my ($out) = @_;

    foreach $word (@wordList) {
	print {$out} $word->{levels},
	    sprintf("%s [%s] %s\n",
		    $word->{chinese}, $word->{pinyin}, $word->{english});
    }
}


##############
# Main program
##############
setupdefaults();

# Consume leading options.  The loop tests with defined() so that an
# argument of "0" is treated as a filename rather than as "no more
# arguments".
while (defined($thisarg = shift())) {
    if ($thisarg eq "-o") {
	if (!defined($outFname = shift())) { printusage(); }
    }
    elsif ($thisarg eq "-nodup") { $doDup = 0; }
    elsif ($thisarg eq "-license") { printLicense(); exit(0); }
    elsif ($thisarg eq "-nma") {
	if (!defined($neutralMatchesAny = shift())) { printusage(); }
    }
    elsif ($thisarg eq "-uu2u:") {
	$uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
	$uConvert = $uc2uu;
    }
    else { last; }
}

# ok, now $thisarg contains the first filename (if it's defined).
# This is checked before anything is opened for writing, so a bad
# command line can never create or truncate the output file.
if (!defined($thisarg)) { printusage(); }

@wordList = ();
$vocabIndex = 0;
readvocabfile($thisarg, \@wordList, 0);
print "# Got ", scalar(@wordList), " entries\n";
while (defined($thisarg = shift())) {
    my $savedVocabIndex = $vocabIndex;
    readvocabfile($thisarg, \@wordList, 1);
    print "# Got another ", $vocabIndex - $savedVocabIndex, " entries\n";
}

# The output is opened only after every input has been read completely.
# That way an invalid input line does not leave a truncated output file
# behind, and naming one of the input files as the output file does not
# destroy it before it has been read.
if ($outFname eq "-") {
    # ">-" is perl's way of saying "stdout"
    open(OUTFP, ">-") or die "Couldn't open output file `$outFname'\n";
}
else {
    open(OUTFP, ">", $outFname)
	or die "Couldn't open output file `$outFname': $!\n";
}
printVocab(\*OUTFP);
# Buffered write errors (e.g. a full disk) only show up at close time.
close(OUTFP) or die "Couldn't finish writing output file `$outFname': $!\n";