File: PYUtil.py

package info (click to toggle)
scim-python 0.1.13~rc1-3
links: PTS, VCS
area: main
in suites: squeeze
size: 3,436 kB
ctags: 2,794
sloc: sh: 9,774; python: 9,551; cpp: 3,420; makefile: 349; sed: 16
file content (152 lines) | stat: -rw-r--r-- 4,233 bytes
# -*- coding: utf-8 -*-
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
from PYDict import *

class PinYinWord:
	"""A single pinyin syllable, or a bare initial (sheng mu) of one.

	A string that is a key of PINYIN_DICT is a complete syllable; any
	other string is looked up in SHENGMU_DICT as an initial.  The tables
	PINYIN_DICT, SHENGMU_DICT and ID_SHENGMU_DICT are provided by the
	PYDict module.
	"""

	# Input spellings that __init__ replaces before any table lookup.
	correct_dict = {"nve" : "nue", "lve" : "lue"}

	def __init__ (self, pinyin):
		"""Store pinyin and resolve its ids from the PYDict tables.

		For a complete syllable both the pinyin id and the initial id are
		stored.  Otherwise only the initial id is stored, so
		get_pinyin_id () raises AttributeError on such an instance.

		A string found in neither PINYIN_DICT nor SHENGMU_DICT makes the
		SHENGMU_DICT lookup fail.
		"""
		pinyin = self.correct_dict.get (pinyin, pinyin)

		self._pinyin = pinyin
		self._is_completed = self.is_valid_pinyin ()
		if self._is_completed:
			sheng_mu = self.split ()[0]
			self._pinyin_id = PINYIN_DICT [self._pinyin]
			self._sheng_mu_id = SHENGMU_DICT [sheng_mu]
		else:
			self._sheng_mu_id = SHENGMU_DICT [self._pinyin]

	def is_valid_pinyin (self):
		"""Return True when the stored string is a key of PINYIN_DICT."""
		return self._pinyin in PINYIN_DICT

	def get_sheng_mu_id (self):
		"""Return the SHENGMU_DICT id of the initial."""
		return self._sheng_mu_id

	def get_shengmu (self):
		"""Return the initial string, looked up by id in ID_SHENGMU_DICT."""
		return ID_SHENGMU_DICT[self._sheng_mu_id]

	def get_pinyin_id (self):
		"""Return the PINYIN_DICT id; only set for a complete syllable."""
		return self._pinyin_id

	def get_pinyin (self):
		"""Return the stored (possibly corrected) pinyin string."""
		return self._pinyin

	def get_pattern (self, mohu = False):
		"""Return a match pattern for this word.

		A "%" marks an open-ended part of the pattern (presumably an SQL
		LIKE wildcard -- confirm against the caller).

		With mohu equal to False a complete syllable is returned as is and
		anything else gets a trailing "%".  Otherwise the fuzzy form is
		built: the initials zh/ch/sh/z/c/s are cut to their first letter
		plus "%", and the finals ing/in/en/eng/an/ang are cut to their
		first two letters plus "%".
		"""
		is_valid = self.is_valid_pinyin ()

		# Kept as an == comparison: only values equal to False select the
		# exact form, so e.g. None still selects the fuzzy form as before.
		if mohu == False:
			if is_valid:
				return self._pinyin
			return self._pinyin + "%"

		if not is_valid:
			if self._pinyin in ("zh", "ch", "sh"):
				return self._pinyin[0] + "%"
			return self._pinyin + "%"

		shengmu = self.get_shengmu ()
		yunmu = self._pinyin [len (shengmu):]
		if shengmu in ("zh", "ch", "sh", "z", "c", "s"):
			shengmu = shengmu[0] + "%"
		if yunmu in ("ing", "in", "en", "eng", "an", "ang"):
			yunmu = yunmu[0:2] + "%"
		return shengmu + yunmu

	def split (self):
		"""Return the syllable as an (initial, final) pair of strings.

		The two-letter prefix is tried before the one-letter prefix; when
		neither is a key of SHENGMU_DICT the initial is "".

		Raises Exception when the stored string is not a complete syllable.
		"""
		if not self.is_valid_pinyin ():
			# The message used to reference an undefined name, which
			# raised NameError instead of this exception.
			raise Exception ("Pinyin '%s' is not a valid pinyin!" % self._pinyin)
		for length in (2, 1):
			head = self._pinyin[:length]
			if head in SHENGMU_DICT:
				return head, self._pinyin[length:]
		return "", self._pinyin[:]

	def __str__ (self):
		"""Return the stored pinyin string."""
		return self._pinyin

class PinYinString:
	"""Placeholder for a pinyin string type; it has no behaviour yet."""

	def __init__ (self, string):
		"""Accept string and ignore it; nothing is stored on the instance."""

def load_pinyin_table (_file):
	"""Read a hanzi/pinyin/frequency table into a nested dictionary.

	_file is any iterable of lines, given either as UTF-8 encoded byte
	strings or as already decoded text.  Every non-blank line must hold
	exactly three whitespace-separated fields: hanzi, pinyin and an
	integer frequency.  Blank lines are skipped.

	Returns {hanzi: {pinyin: freq}}; the frequencies of repeated
	(hanzi, pinyin) pairs are added together.

	Raises ValueError when a non-blank line does not have three fields or
	when its frequency is not an integer.
	"""
	hanzi_dic = {}
	for line in _file:
		if isinstance (line, bytes):
			line = line.decode ("utf-8")
		fields = line.split ()
		if not fields:
			# A blank line (e.g. at the end of the file) carries no entry.
			continue
		hanzi, pinyin, freq = fields
		pinyin_freqs = hanzi_dic.setdefault (hanzi, {})
		pinyin_freqs[pinyin] = pinyin_freqs.get (pinyin, 0) + int (freq)

	return hanzi_dic

def load_phrase_pinyin_freq (_file):
	"""Read phrase/pinyin/frequency lines into a dictionary of lists.

	_file is any iterable of lines, given either as UTF-8 encoded byte
	strings or as already decoded text.  Every non-blank line must hold
	exactly three whitespace-separated fields: phrase, pinyin and an
	integer frequency.  Blank lines are skipped.  Each "u:" in the pinyin
	is replaced by "v".

	Returns {phrase: [(phrase, pinyin, freq), ...]} with the tuples in
	input order.

	Raises ValueError when a non-blank line does not have three fields or
	when its frequency is not an integer.
	"""
	phrases_dic = {}
	for line in _file:
		if isinstance (line, bytes):
			line = line.decode ("utf-8")
		fields = line.split ()
		if not fields:
			# A blank line (e.g. at the end of the file) carries no entry.
			continue
		phrase, pinyin, freq = fields
		pinyin = pinyin.replace (u"u:", u"v")
		phrases_dic.setdefault (phrase, []).append ((phrase, pinyin, int (freq)))

	return phrases_dic

def load_phrase_pinyin (_file):
	"""Read phrase/pinyin lines into a dictionary of lists.

	_file is any iterable of lines, given either as UTF-8 encoded byte
	strings or as already decoded text.  Every non-blank line must hold
	exactly two whitespace-separated fields: phrase and pinyin.  Blank
	lines are skipped.  Each "u:" in the pinyin is replaced by "v".

	Returns {phrase: [(phrase, pinyin, 0), ...]} with the tuples in input
	order; the frequency slot is always 0.

	Raises ValueError when a non-blank line does not have two fields.
	"""
	phrases_dic = {}
	for line in _file:
		if isinstance (line, bytes):
			line = line.decode ("utf-8")
		fields = line.split ()
		if not fields:
			# A blank line (e.g. at the end of the file) carries no entry.
			continue
		phrase, pinyin = fields
		pinyin = pinyin.replace (u"u:", u"v")
		phrases_dic.setdefault (phrase, []).append ((phrase, pinyin, 0))

	return phrases_dic

def load_sogou_phrases (_file):
	"""Read tab-separated phrase/number lines into a dictionary.

	_file is any iterable of lines, given either as UTF-8 encoded byte
	strings or as already decoded text.  Each non-blank line is split on
	runs of tab characters; the first field is the phrase and the second
	is parsed as an integer.  Further fields are ignored and blank lines
	are skipped.

	Returns {phrase: (phrase, number)}; a phrase seen again replaces the
	earlier entry.

	Raises IndexError when a non-blank line contains no tab, and
	ValueError when the second field is not an integer.
	"""
	import re
	# Plain raw string: matches the same text as the former ur"" literal
	# and is valid syntax on Python 2 and Python 3 alike.
	tab_run = re.compile (r"\t+")
	dic = {}
	for line in _file:
		if isinstance (line, bytes):
			line = line.decode ("utf-8")
		if not line.strip ():
			# A blank line (e.g. at the end of the file) carries no entry.
			continue
		fields = tab_run.split (line)
		dic [fields[0]] = (fields[0], int (fields[1]))
	return dic