1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
|
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
#
# $Id: $
#
import sys, os, re
import PYSQLiteDB
import bz2
def _parse_phrase_lines (lines, freq_scale = 1):
    """Yield (phrase, pinyin, freq) tuples parsed from UTF-8 encoded lines.

    Each non-blank line must contain exactly three whitespace-separated
    fields: the phrase, its pinyin and an integer frequency.  Every
    occurrence of u"u:" in the pinyin field is rewritten to u"v", and the
    frequency is multiplied by freq_scale.

    Blank lines are skipped.  A line with any other number of fields, or
    with a non-integer frequency, raises ValueError.
    """
    for raw_line in lines:
        # split () without arguments already discards leading and trailing
        # whitespace, so no separate strip () is required.
        fields = raw_line.decode ("utf-8").split ()
        if not fields:
            # A stray empty line (e.g. at the end of the file) carries no
            # data and must not abort the whole database build.
            continue
        phrase, pinyin, freq = fields
        yield (phrase, pinyin.replace (u"u:", u"v"), int (freq) * freq_scale)


def _load_phrases (db, stream, freq_scale = 1):
    """Pass the phrases parsed from stream to db.add_phrases, then close it.

    The stream is closed even when parsing or insertion fails.
    """
    # NOTE(review): assumes db.add_phrases consumes the iterable before it
    # returns; confirm against PYSQLiteDB.add_phrases.
    try:
        db.add_phrases (_parse_phrase_lines (stream, freq_scale))
    finally:
        stream.close ()


def main ():
    """Build the "py.db" phrase database in the current working directory.

    When exactly one command line argument is given it is used as the
    source directory; otherwise "." is used.  Any existing "py.db" is
    removed, the tables are created and initialised, and the following
    files are loaded in this order:

      * <srcdir>/../../../data/pinyin_table.txt (frequencies times 1300)
      * <srcdir>/phrase_pinyin.txt.bz2
      * <srcdir>/phrase_pinyin_duoyin.txt

    Finally the database is optimized.
    """
    srcdir = "."
    if len (sys.argv) == 2:
        srcdir = sys.argv[1]

    filename = "py.db"
    try:
        os.unlink (filename)
    except OSError:
        # Best effort: the database usually does not exist yet.  Only OS
        # level failures are ignored, so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        pass

    print ("Create DB")
    db = PYSQLiteDB.PYSQLiteDB (filename = filename)
    db.create_tables ()
    db.init_pinyin_table ()
    db.init_shengmu_table ()

    # The input files are opened in binary mode because the parser decodes
    # every line from UTF-8 itself.
    print ("Load pinyin_table.txt")
    filename = os.path.join (srcdir, "../../../data/pinyin_table.txt")
    # NOTE(review): the factor 1300 is taken over unchanged from the
    # original code; its rationale is not visible in this file.
    _load_phrases (db, open (filename, "rb"), 1300)

    print ("Load phrase_pinyin.txt.bz2")
    filename = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
    _load_phrases (db, bz2.BZ2File (filename, "r"))

    print ("Load phrase_pinyin_duoyin.txt")
    filename = os.path.join (srcdir, "phrase_pinyin_duoyin.txt")
    _load_phrases (db, open (filename, "rb"))

    print ("Optimizing database")
    db.optimize_database ()
# Run the database build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main ()
|