1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
|
#!/usr/bin/perl -w
#
# Copyright (c) 1998 David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 85.
#
# File: cedictmerge, version 1.1
# By: David Hiebeler
# Center for Applied Math
# Cornell University
# Ithaca, NY 14853
# hiebeler@cam.cornell.edu
# http://www.cam.cornell.edu/hiebeler/home.html
#
# Version 1.1: December 1998
# Version 1.0: July 1998
#
#
# This is a perl script for merging two CEDICT-format files (see
# "http://www.mindspring.com/~paul_denisowski/cedict.html" for
# information about CEDICT).
#
# Usage: cedictmerge [-nma 0|1] [-o outFile] [-nodup] [-uu2u: | -u:2uu] file1 file2 [file3 file4 ... fileN]
#
# Consider file1 to be the "original" or "main" file, and the other
# files will be merged into it. Consider file2 being merged in.
# If an entry in file2 is not in file1, then it will be appended to
# the end of file1's data (however, file1 will not be changed -- the
# merged data will be written to outFile if specified, otherwise stdout).
# If an entry in file2 is in file1, then it will still be appended to
# the end of the merged file, but with the special field "/!!!!!/" appended
# to its English definition (so that one can easily find such entries
# using an editor later, and manually merge duplicate entries).
#
# If the "-nodup" flag is specified, duplicate entries from file2 will not
# be appended to the output data.
#
# You can use the "-nma x" argument to set the "NeutralMatchesAny" flag.
# The value "x" should be either 0 or 1. If you use 1, it means a neutral
# tone (i.e. tone 5) matches any tone. This is because it's a pretty
# common mistake (at least for me) to put the "intrinsic tone" in the pinyin
# field for a character, if I don't realize the character's tone becomes
# neutral in that particular word. This helps catch such mistakes.
#
# You can use the "-uu2u:" command-line argument to turn pinyin entries
# like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite,
# i.e. turn "nu:3" into "nuu3". (This feature is available because
# both forms have appeared in various versions of CEDICT).
#
# This script should work correctly on both GB and BIG5 files.
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception: it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line). You may want to use the "cedictcheckformat" script
# first to catch any lines in your vocabulary file which are not in strict
# CEDICT format. You may also want to use "cedictsort" after merging,
# to sort the results.
#
# Also note that when reading the second and later files, this script
# checks each new word against the list of all words it has seen so far.
# In particular, this means it will catch duplicate entries within a
# single merged-in file, not only duplicates between different files.
# The first file is read without any duplicate checking.
#
# Wishlist:
# o) Allow the option of ignoring the pinyin field, since then it could
# catch entries which have mistakes in the pinyin (other than just
# mistakes about characters changing to neutral tone). It would probably
# be best to do this only for multi-character words, so that it wouldn't
# flag all of the single characters which have multiple pronunciations.
#
# History:
# 10 Dec 1998: added code to turn "uu" into "u:" or vice-versa in the
# pinyin field if the user requests it, to handle the fact that both
# forms have been present in cedict for some time now.
# 29 July 1998: original version, 1.0
# Define a couple of constants
# Values stored in $uConvert to select a pinyin rewrite:
#   $uu2uc -- rewrite "uu" as "u:"   (selected by the -uu2u: option)
#   $uc2uu -- rewrite "u:" as "uu"   (selected by the -u:2uu option)
($uu2uc, $uc2uu) = (1, 2);
# printLicense()
#
# Print the program banner and the GNU GPL notice on the currently
# selected output handle (normally standard output).  Takes no arguments.
#
# The banner used to say "cedictsort" (copied from the sibling script);
# it now names this program, cedictmerge.
sub printLicense {
    # The heredoc body and its terminator must stay at column 0.
    print <<"END_OF_LICENSE";
cedictmerge version 1.1 June 10, 1999
Copyright (C) 1998,1999 David Hiebeler
Center for Applied Math
Cornell University
Ithaca, NY 14853
hiebeler\@cam.cornell.edu
http://www.cam.cornell.edu/hiebeler/home.html
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
END_OF_LICENSE
}
#
# Set up default parameter values.
#
sub setupdefaults {
    # Output goes to "-" (stdout) and duplicates are written out
    # unless the command line says otherwise.
    ($outFname, $doDup) = ("-", 1);

    # Both optional behaviours start switched off: neutral-tone
    # wildcard matching and the uu <-> u: pinyin conversion.
    $neutralMatchesAny = $uConvert = 0;
}
#
# Print a usage message and exit.
#
sub printusage {
    # Build the whole help text first, then emit it in one go and
    # terminate the script with exit status 2.
    my @helpLines = (
        "Usage: $0 [-nma 0|1] [-o outFname] [-nodup] [-uu2u: | -u:2uu] file1 file2 [file3 ... fileN]\n",
        " -nma 0|1 : Neutral Matches Any (1 for on, 0 for off) -- for comparing two\n",
        " entries to see if they are the same; on means neutral tone\n",
        " matches any tone\n",
        " -o outFname : specify where to put the output (default = stdout)\n",
        " -nodup : Don't output duplicate entries\n",
        " (default = output, with extra /!!!!!/ field at end)\n",
        " -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n",
        " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n",
    );
    print @helpLines;
    exit 2;
}
#
# Read a line, removing comments which begin with "#", and ignoring
# empty lines (or lines which only have a comment).
#
# Running count of raw input lines consumed by getline (skipped lines
# included); it is never reset between files.
$getlinelinenum = 0;

# getline([FILEHANDLE])
#
# Return the next meaningful line of input with any trailing "#" comment
# and the line terminator removed.  Blank lines and comment-only lines
# are skipped.  With no argument the line is read via <> (the files named
# in @ARGV, or stdin); with one argument it is read from that filehandle,
# which may be a handle reference or a handle name such as "INFILE".
#
# Returns nothing (undef in scalar context) at end of input.
# Dies if called with more than one argument.
sub getline {
    die "getline must be called with at most one argument" if @_ > 1;
    my $fh = @_ ? $_[0] : undef;

    while (1) {
        # A lexical is used instead of $_ so the caller's $_ is untouched.
        my $line;
        if (defined $fh) { $line = <$fh>; }
        else             { $line = <>; }
        last unless defined $line;

        $getlinelinenum++;
        next if $line =~ /^\s*#/;    # comment-only line
        next if $line =~ /^\s*$/;    # blank line
        $line =~ s/#.*$//;           # strip trailing comment

        # chomp, not chop: a final line lacking a newline must not lose
        # its last real character.
        chomp $line;
        return $line;
    }
    return;
}
#
# Return 1 if the two strings have the same pinyin, otherwise return 0
# The two strings are in $_[0] and $_[1]
#
# samePinyin(PINYIN1, PINYIN2)
#
# Compare two whitespace-separated pinyin strings syllable by syllable.
# Returns 1 if they are considered the same, 0 otherwise.
#
# Strings with a different number of syllables never match.  Syllables
# that are string-equal always match.  When the global
# $neutralMatchesAny is true, a syllable ending in the neutral tone
# digit "5" also matches the same syllable ending in any other single
# digit (in either argument position).
#
# The comparison is done with plain string operations rather than by
# interpolating a syllable into a regular expression, so characters such
# as "." or "(" in the input are compared literally and can neither
# cause false matches nor a fatal pattern-compilation error.
sub samePinyin {
    my @words1 = split(" ", $_[0]);
    my @words2 = split(" ", $_[1]);
    return 0 if scalar(@words1) != scalar(@words2);

    for my $i (0 .. $#words1) {
        my ($syl1, $syl2) = ($words1[$i], $words2[$i]);
        next if $syl1 eq $syl2;

        # Different strings can only match through the neutral-tone rule.
        return 0 unless $neutralMatchesAny;
        return 0 unless $syl1 =~ /5\z/ || $syl2 =~ /5\z/;

        # Both syllables must carry a trailing tone digit, and must be
        # identical once that digit is removed.
        (my $base1 = $syl1) =~ s/[0-9]\z//;
        (my $base2 = $syl2) =~ s/[0-9]\z//;
        return 0 if $base1 eq $syl1 || $base2 eq $syl2;
        return 0 if $base1 ne $base2;
    }

    # every syllable matched
    return 1;
}
#
# read in a vocabulary file
# Filename to read from is in $_[0]
# Reference to array of references to hashes to use is in $_[1]
# If this is not the first file we are reading, then $_[2] will be 1,
# telling us we should check for duplicates as we read. If $_[2] is 0,
# then this is the first file, and we just read it, not checking for
# duplicate entries.
#
# readvocabfile(FILENAME, ARRAYREF, CHECKDUPS)
#
# Parse the CEDICT-format file FILENAME and append one hash per entry
# (keys "levels", "chinese", "pinyin", "english") to the array behind
# ARRAYREF, starting at index $vocabIndex and advancing that global.
# A FILENAME of "-" reads standard input.
#
# When CHECKDUPS is 1, each new entry is compared against every entry
# already stored; a duplicate (same chinese, pinyin equal according to
# samePinyin) is either dropped ($doDup == 0) or kept with "/!!!!!/"
# appended to its english field.
#
# Honors the global $uConvert for the uu <-> u: pinyin conversion.
# Dies if the file cannot be opened or a line is not in CEDICT format.
sub readvocabfile {
    my ($fname, $arrayRef, $checkDups) = @_;
    my ($levels, $chinese, $pinyin, $english);

    my $in;
    if ($fname eq "-") {
        $in = \*STDIN;
    }
    else {
        # Three-argument open: the file name is never parsed for mode
        # characters or pipes.
        open($in, '<', $fname) or die "Couldn't open infile '$fname': $!";
    }

  READVOCABLOOP:
    # defined() so that an input line which is false (e.g. "0") reaches
    # the format check below instead of silently ending the read.
    while (defined(my $line = getline($in))) {
        # handle case where line has skill level(s) at beginning
        if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($levels, $chinese, $pinyin, $english) = ($1, $2, $3, $4);
            $chinese =~ s/\s+$//;    # truncate trailing spaces on chinese
            $levels .= " ";
        }
        # line doesn't have skill level numbers at beginning
        elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($chinese, $pinyin, $english) = ($1, $2, $3);
            $chinese =~ s/\s+$//;    # truncate trailing spaces on chinese
            $levels = "";
        }
        else {
            $line =~ s/[\n\r]//g;
            print "Invalid line: `$line'\n";
            die "Invalid line encountered";
        }

        # Convert "uu" into "u:" or vice-versa in the pinyin field, if
        # the user requested it.  /g so that every syllable of a
        # multi-syllable entry is converted, not just the first.
        if ($uConvert == $uu2uc) {
            $pinyin =~ s/uu/u:/g;
        }
        elsif ($uConvert == $uc2uu) {
            $pinyin =~ s/u:/uu/g;
        }

        if ($checkDups == 1) {
            for my $i (0 .. $vocabIndex - 1) {
                my $seen = $arrayRef->[$i];
                next unless $seen->{chinese} eq $chinese;
                # same chinese; only a duplicate if the pinyin agrees too
                next unless samePinyin($seen->{pinyin}, $pinyin);

                # Duplicates are either skipped entirely or flagged.
                next READVOCABLOOP if $doDup == 0;
                $english =~ s@/$@/!!!!!/@;
                last;    # flag once, then stop searching
            }
        }

        # now put everything into the main array of hashes
        $arrayRef->[$vocabIndex] = {
            levels  => $levels,
            chinese => $chinese,
            english => $english,
            pinyin  => $pinyin,
        };
        $vocabIndex++;
    }
    close $in;
}
#
# Print out the vocabulary list
#
sub printVocab {
    # $_[0] is the destination filehandle (a handle reference or a
    # handle name).  Every entry of the global @wordList is written as
    #   <levels><chinese> [<pinyin>] <english>
    # one entry per line.
    my ($dest) = @_;
    for my $entry (@wordList) {
        my $prefix = "$entry->{levels}";
        my $rest   = "$entry->{chinese} [$entry->{pinyin}] $entry->{english}\n";
        print $dest $prefix, $rest;
    }
}
##############
# Main program
##############
setupdefaults();

# Parse leading options.  The loop ends at the first argument that is
# not a recognised option, which is left in $thisarg as the first input
# file name.  defined() is used so that an argument of "0" is not
# mistaken for the end of the argument list.
my $thisarg;
while (defined($thisarg = shift(@ARGV))) {
    if ($thisarg eq "-o") {
        if (!defined($outFname = shift(@ARGV))) { printusage(); }
    }
    elsif ($thisarg eq "-nodup") { $doDup = 0; }
    elsif ($thisarg eq "-license") { printLicense(); exit(0); }
    elsif ($thisarg eq "-nma") {
        if (!defined($neutralMatchesAny = shift(@ARGV))) { printusage(); }
    }
    elsif ($thisarg eq "-uu2u:") {
        $uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
        $uConvert = $uc2uu;
    }
    else { last; }
}

# ok, now $thisarg contains the first filename (if it's defined).
# Check this BEFORE touching the output file, so that a usage error
# cannot truncate an existing file named with -o.
if (!defined($thisarg)) { printusage(); }

# "-" means standard output; anything else is opened with three-argument
# open so the name is never interpreted as a mode or a pipe.
my $outFh;
if ($outFname eq "-") {
    $outFh = \*STDOUT;
}
else {
    open($outFh, '>', $outFname)
        or die "Couldn't open output file `$outFname': $!\n";
}

@wordList = ();
$vocabIndex = 0;

# The first file is read without duplicate checking ...
readvocabfile($thisarg, \@wordList, 0);
print "# Got ", scalar(@wordList), " entries\n";

# ... every further file is checked against everything read so far.
while (defined($thisarg = shift(@ARGV))) {
    my $savedVocabIndex = $vocabIndex;
    readvocabfile($thisarg, \@wordList, 1);
    print "# Got another ", $vocabIndex - $savedVocabIndex, " entries\n";
}

printVocab($outFh);

# Buffered write errors (e.g. a full disk) only surface at close.
close($outFh) or die "Error writing output to `$outFname': $!\n";
|