# File: PYCreatePinYinDB.py
#
# Package info:
#   scim-python 0.1.13~rc1-3
#   links: PTS, VCS
#   area: main
#   in suites: squeeze
#   size: 3,436 kB
#   ctags: 2,794
#   sloc: sh: 9,774; python: 9,551; cpp: 3,420; makefile: 349; sed: 16
# File content: 75 lines, -rw-r--r--, 2,161 bytes
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
import sys, os, re
import PYSQLiteDB
import bz2

def _parse_phrase_lines (lines, freq_scale = 1):
	"""Yield (phrase, pinyin, freq) tuples parsed from an iterable of lines.

	Each non-blank line must be UTF-8 encoded bytes holding exactly three
	whitespace-separated fields: phrase, pinyin and an integer frequency.
	Every "u:" in the pinyin field is replaced with "v", and the frequency
	is multiplied by freq_scale before it is yielded.

	Raises ValueError if a line is not valid UTF-8, if a non-blank line does
	not have exactly three fields, or if its frequency is not an integer.
	"""
	for line in lines:
		# split () without arguments already discards leading and trailing
		# whitespace, so no separate strip () is needed.
		fields = line.decode ("utf-8").split ()
		if not fields:
			# Blank line (e.g. a stray newline at the end of the file):
			# skip it instead of failing on the three-way unpack below.
			continue
		phrase, pinyin, freq = fields
		yield (phrase, pinyin.replace (u"u:", u"v"), int (freq) * freq_scale)


def _add_phrases_from (db, source, freq_scale = 1):
	"""Feed the phrases parsed from the open file `source` to db.add_phrases.

	`source` is always closed afterwards, even if parsing or insertion fails.
	"""
	# NOTE(review): assumes db.add_phrases consumes the iterable before it
	# returns, so closing the file right after the call is safe - confirm
	# against PYSQLiteDB.
	try:
		db.add_phrases (_parse_phrase_lines (source, freq_scale))
	finally:
		source.close ()


def main ():
	"""Build the py.db pinyin phrase database in the current directory.

	An optional single command-line argument names the source directory that
	holds the phrase data files; it defaults to ".".  Any existing py.db is
	removed first, then the tables are created and filled from
	pinyin_table.txt, phrase_pinyin.txt.bz2 and phrase_pinyin_duoyin.txt, in
	that order, before the database is optimized.
	"""
	srcdir = "."
	if len (sys.argv) == 2:
		srcdir = sys.argv[1]

	db_filename = "py.db"
	try:
		os.unlink (db_filename)
	except OSError:
		# Best effort: the usual case is that no old database exists yet.
		# Only filesystem errors are ignored, so KeyboardInterrupt and
		# SystemExit are no longer swallowed.
		pass

	print ("Create DB")
	db = PYSQLiteDB.PYSQLiteDB (filename = db_filename)
	db.create_tables ()
	db.init_pinyin_table ()
	db.init_shengmu_table ()

	# Frequencies read from pinyin_table.txt are multiplied by this factor
	# before insertion; the other two sources are inserted unscaled.
	# NOTE(review): the rationale for the value 1300 is not visible here.
	pinyin_table_freq_scale = 1300

	# All sources are opened in binary mode because the parser decodes each
	# line from UTF-8 itself.
	print ("Load pinyin_table.txt")
	path = os.path.join (srcdir, "../../../data/pinyin_table.txt")
	_add_phrases_from (db, open (path, "rb"), pinyin_table_freq_scale)

	print ("Load phrase_pinyin.txt.bz2")
	path = os.path.join (srcdir, "phrase_pinyin.txt.bz2")
	_add_phrases_from (db, bz2.BZ2File (path, "r"))

	print ("Load phrase_pinyin_duoyin.txt")
	path = os.path.join (srcdir, "phrase_pinyin_duoyin.txt")
	_add_phrases_from (db, open (path, "rb"))

	print ("Optimizing database")
	db.optimize_database ()
	
# Build the database only when this file is executed as a script; importing
# it as a module does not call main ().
if __name__ == "__main__":
	main ()