File: ZhengJu.py

package info (click to toggle)
scim-python 0.1.13~rc1-3
links: PTS, VCS
area: main
in suites: squeeze
size: 3,436 kB
ctags: 2,794
sloc: sh: 9,774; python: 9,551; cpp: 3,420; makefile: 349; sed: 16
file content (1268 lines) | stat: -rw-r--r-- 41,169 bytes
# -*- coding: utf-8 -*-
# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Yu Fan <yufanyufan@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
import scim
import scim.Log
import os
from scim import KeyCode
from scim import KeyMask
from scim import Property
import traceback
from PYDict import *
from gettext import dgettext
from ZhengJuDB import *
import scim.ascii as ascii
from sets import Set
import popen2

def _(a):
	"""Translate *a* using the "scim-python" gettext domain."""
	return dgettext ("scim-python", a)

def RGB(r, g, b):
	"""Pack three colour channels into one 0xRRGGBB integer.

	Each channel is masked to its low 8 bits before being combined.
	"""
	red = (r & 0xff) << 16
	green = (g & 0xff) << 8
	blue = b & 0xff
	return red | green | blue
# Short aliases for the SCIM base classes used by this module.
IMEngine = scim.IMEngine
IMEngineFactory = scim.IMEngineFactory

# Column indexes of a candidate row.  As used below: YLEN is the phrase
# length, Y0..Y3 hold the pinyin ids of the first four syllables, YX is a
# "'"-separated pinyin string for the syllables after the fourth, PHRASE is
# the phrase text and ADJ_FREQ its (adjusted) frequency.
YLEN, Y0, Y1, Y2, Y3, YX, PHRASE, ADJ_FREQ = range (8)

def candidate_sort(x, y):
	"""cmp-style comparator: longer phrases first, then higher frequency first."""
	if x[YLEN] != y[YLEN]:
		return cmp(y[YLEN], x[YLEN])
	return cmp(y[ADJ_FREQ], x[ADJ_FREQ])

class InputException(Exception):
	"""Raised by the editor when a request cannot be carried out in the current state."""
	def __init__ (self):
		super(InputException, self).__init__()

class PinYinWord:
	"""One pinyin syllable of the input buffer plus the character chosen for it.

	The syllable is kept split into its initial (``shengmu``) and final
	(``yunmu``).  ``spliter`` is "'" when the syllable was entered with a
	leading apostrophe, ``char`` is the character currently assigned to the
	syllable and ``manual`` is set by the editor when a candidate is selected.
	"""
	def __init__ (self, shengmu = "", yunmu = "", pinyin = ""):
		"""Build the syllable from *pinyin*, or from *shengmu* + *yunmu* when *pinyin* is empty."""
		self.char = ""
		self._pinyin_id = None
		# Pinyin id of the selected candidate; assigned by the editor on
		# selection and read by mohuyin().
		self.real_pinyin_id = None
		self.manual = None
		self.spliter = ""
		if pinyin:
			self.set_pinyin(pinyin)
		else:
			self.set_pinyin(shengmu + yunmu)

	def _split_pinyin(self, pinyin):
		"""Return (shengmu, yunmu) for *pinyin*, preferring a two-letter initial."""
		if pinyin[:2] in SHENGMU_LIST:
			return pinyin[:2], pinyin[2:]
		if pinyin[:1] in SHENGMU_LIST:
			return pinyin[:1], pinyin[1:]
		return "", pinyin

	def set_pinyin(self,pinyin):
		"""Store *pinyin*, stripping and remembering a leading "'" separator.

		``_pinyin_id`` is only updated when the result is a complete pinyin;
		for an incomplete syllable the previous value is left untouched.
		"""
		# Slice instead of indexing so that an empty string (the default
		# constructor arguments) does not raise IndexError.
		if pinyin[:1] == "'":
			self.spliter = "'"
			pinyin = pinyin[1:]
		self.shengmu, self.yunmu = self._split_pinyin(pinyin)
		full = self.get_pinyin()
		if full in PINYIN_LIST:
			self._pinyin_id = PINYIN_DICT [full]
		self._sheng_mu_id = SHENGMU_DICT [self.get_shengmu()]

	def mohuyin(self):
		"""Re-spell the syllable after ``real_pinyin_id``.

		The initial is always taken from the real pinyin; the final is only
		replaced when the user had typed one.
		"""
		pinyin = ID_PINYIN_DICT[self.real_pinyin_id]
		self.shengmu, yunmu = self._split_pinyin(pinyin)
		if self.yunmu != "":
			self.yunmu = yunmu
		self.set_pinyin(self.get_pinyin())

	def get_sheng_mu_id (self):
		"""Return the id of the initial as looked up in SHENGMU_DICT."""
		return self._sheng_mu_id

	def get_pinyin_id (self):
		"""Return the id of the complete pinyin, or None if none was recorded."""
		return self._pinyin_id

	def set_pinyin_id (self, id):
		"""Set the syllable from a pinyin id via ID_PINYIN_DICT."""
		self.set_pinyin(ID_PINYIN_DICT[id])

	def get_shengmu (self):
		"""Return the initial ("" when there is none)."""
		return self.shengmu

	def set_yunmu( self,yunmu):
		"""Replace the final; an empty final clears the pinyin id.

		Raises KeyError when the resulting pinyin is not in PINYIN_DICT.
		"""
		self.yunmu = yunmu
		if yunmu != "":
			self._pinyin_id = PINYIN_DICT [ self.get_pinyin() ]
		else:
			self._pinyin_id = None

	def set_char (self,char):
		"""Assign the character displayed for this syllable."""
		self.char = char

	def get_pinyin (self):
		"""Return initial + final, without the separator."""
		return self.shengmu + self.yunmu

	def get_screen_pinyin (self):
		"""Return the pinyin as shown on screen, including the separator."""
		return self.spliter + self.shengmu + self.yunmu

	def __str__ (self):
		return self.get_pinyin()

	def is_complete (self):
		"""Return True when a pinyin id is recorded for the syllable."""
		return self._pinyin_id is not None

		
class Editor:
	database = None
	def __init__ (self, config = None):
		"""Create an editor; the ZhengJuDB instance is shared by all editors.

		config -- configuration object; a PseudoConfig is used when omitted.
		"""
		if config is None:
			config = PseudoConfig()
		# The database is created once and kept on the class.
		if Editor.database is None:
			Editor.database = ZhengJuDB(config)
		table = scim.LookupTable (9)
		table.fix_page_size(True)
		self.lookup_table = table
		self.clear()
		self.config = config
		self.load_config(config)

	def clear(self):
		self.cursor = 0
		self.wordlist = []
		self.pinyinlist = []
		self.candidates = []
		self.predict = []
		self.lookup_table.clear()
		self.lookup_table.show_cursor(False)
		Editor.database.clear_cache()
	def load_config(self, config):
		Editor.database.load_config(config)
		self.userword = config.read ("/IMEngine/Python/ZhengJu/CreateUserWords", True)
		self.userphrase = config.read ("/IMEngine/Python/ZhengJu/CreateUserPhrases", True)
		self.adjustfreq = config.read ("/IMEngine/Python/ZhengJu/AdjustWordFreq", True)
		self.logconverror = config.read ("/IMEngine/Python/ZhengJu/LogConvError", True)
		self.splitpinyin = config.read ("/IMEngine/Python/ZhengJu/SplitPinyin", True)
		self.enable_mohuyin = config.read ("/IMEngine/Python/ZhengJu/FuzzyPinyin", False)
		self.mohuyin_s_sh = config.read ("/IMEngine/Python/ZhengJu/FuzzyS_Sh", True)
		self.mohuyin_c_ch = config.read ("/IMEngine/Python/ZhengJu/FuzzyC_Ch", True)
		self.mohuyin_z_zh = config.read ("/IMEngine/Python/ZhengJu/FuzzyZ_Zh", True)
		self.mohuyin_l_n = config.read ("/IMEngine/Python/ZhengJu/FuzzyL_N", True)
		self.mohuyin_in_ing = config.read ("/IMEngine/Python/ZhengJu/FuzzyIn_Ing", True)
		self.mohuyin_en_eng = config.read ("/IMEngine/Python/ZhengJu/FuzzyEn_Eng", True)
		self.mohuyin_an_ang = config.read ("/IMEngine/Python/ZhengJu/FuzzyAn_Ang", True)
		self.build_mohuyin()
	def build_mohuyin(self):
		"""Build the fuzzy-pinyin tables from the enabled options.

		``shengmu_mohu`` and ``yunmu_mohu`` map an initial / final to its entry
		in MOHU_SHENGMU / MOHU_YUNMU; only the pairs whose option is switched
		on are copied.
		"""
		shengmu_pairs = (
			(self.mohuyin_s_sh, ("s", "sh")),
			(self.mohuyin_z_zh, ("z", "zh")),
			(self.mohuyin_c_ch, ("c", "ch")),
			(self.mohuyin_l_n, ("l", "n")),
		)
		self.shengmu_mohu = {}
		for enabled, keys in shengmu_pairs:
			if enabled:
				for key in keys:
					self.shengmu_mohu[key] = MOHU_SHENGMU[key]
		# The original stored the "ing" entry under the key "in", which left
		# "ing" without fuzzy alternatives and overwrote the "in" entry.
		yunmu_pairs = (
			(self.mohuyin_an_ang, ("an", "ang")),
			(self.mohuyin_en_eng, ("en", "eng")),
			(self.mohuyin_in_ing, ("in", "ing")),
		)
		self.yunmu_mohu = {}
		for enabled, keys in yunmu_pairs:
			if enabled:
				for key in keys:
					self.yunmu_mohu[key] = MOHU_YUNMU[key]
	def current (self):
		if self.pinyinlist:
			return self.pinyinlist[-1]
		else:
			return None

	def is_empty (self):
		return (not self.pinyinlist) and (not self.wordlist)

	def is_end (self):
		return self.is_empty() or (not self.pinyinlist) and self.cursor == len (self.wordlist)

	def get_aux (self):
		return "".join ( u[PHRASE] for u in self.predict)

	def get_screen_pinyin(self):
		if self.splitpinyin:
			s = ""
			if self.pinyinlist:
				for i in range(len(self.pinyinlist)-1):
					p = self.pinyinlist[i].get_screen_pinyin() + self.pinyinlist[i+1].get_screen_pinyin()[0]
					if p in PINYIN_LIST or p in PINYIN_PARTIAL_LIST:
						s += self.pinyinlist[i].get_screen_pinyin() + "'"
					else:
						s += self.pinyinlist[i].get_screen_pinyin()
				s += self.pinyinlist[-1].get_screen_pinyin()
			return s
		else:
			return u"".join( i.get_screen_pinyin() for i in self.pinyinlist)

	def get_preedit (self):			
		return u"".join( [i.char for i in self.wordlist[0:self.cursor] ] ) +\
			self.get_screen_pinyin() + \
			u"".join ( [i.char for i in self.wordlist[self.cursor:]] )

	def get_screen_cursor (self):
		if len(self.get_screen_pinyin())>0:
			return self.cursor + len(self.get_screen_pinyin())
		else:
			return self.cursor
	
	def pinyin_select (self, candidate, manual = False):
		"""Convert the leading pending syllables to *candidate*.

		Each consumed syllable receives its character, the candidate's pinyin
		id and the *manual* flag, and is then moved from ``pinyinlist`` into
		``wordlist`` at the cursor.  A manual selection refreshes the editor.
		"""
		phrase = candidate[PHRASE]
		count = len(phrase)
		for pos, ch in enumerate(phrase):
			word = self.pinyinlist[pos]
			word.set_char(ch)
			if pos < 4:
				# The first four pinyin ids are stored in the row itself.
				word.real_pinyin_id = candidate[pos+1]
			else:
				# Later syllables come from the "'"-separated YX string.
				word.real_pinyin_id = PINYIN_DICT[candidate[YX].split("'")[pos-4]]
			if self.enable_mohuyin:
				word.mohuyin()
			word.manual = manual
		self.wordlist[self.cursor:self.cursor] = self.pinyinlist[:count]
		del self.pinyinlist[:count]
		self.cursor += count
		if manual:
			self.update()
	def reparse_backtrace(self):
		if self.cursor < len(self.wordlist):
			i = self.cursor
			while i >= 0:
				if self.wordlist[i].manual:
					break
				i-=1
			i += 1
			self.reparse(i)
		
	def convert_all (self):
		predicts = self.predict
		for i in predicts:
			self.pinyin_select(i)		
		self.reparse_backtrace()
		self.update ()

	def jump_to_next_word(self):
		string = self.get_preedit ()
		phrase_list = self.split_phrase (string)
		p = 0;
		for i in phrase_list:
			if p <= self.cursor:
				p += i[1]
			else:
				break
		self.cursor = p
		self.update ()
		#~ predict = self.get_predict (self.wordlist[self.cursor:])
		#~ self.cursor += predict[0][YLEN]
		#~ self.update ()

	def predict_len(self, predicts):
		return sum (u[YLEN] for u in predicts)

	def auto_convert (self):
		self.update_predict()
		while self.predict_len(self.predict[:2]) < len (self.pinyinlist):
			self.pinyin_select(self.predict[0])
			self.update_predict()
		self.update_candidates()

	def auto_convert_quanpin(self):
		"""Full-pinyin variant of auto_convert().

		When the last syllable is neither a bare initial nor a partial pinyin
		but is a proper prefix of some longer pinyin, one extra syllable is
		left unconverted.  Unlike auto_convert(), the prediction is not
		refreshed before the loop starts.
		"""
		last = self.pinyinlist[-1].get_pinyin()
		slack = 0
		if last not in SHENGMU_LIST and last not in PINYIN_PARTIAL_LIST:
			if any(last != full and full[:len(last)] == last for full in PINYIN_LIST):
				slack = 1
		while self.predict_len(self.predict[:2]) + slack < len (self.pinyinlist):
			self.pinyin_select(self.predict[0])
			self.update_predict()
		self.update_candidates()

	def update (self):
		"""Forget the candidate cursor, then recompute the prediction and the candidates."""
		self.candidate_cursor = None
		self.update_predict()
		self.update_candidates()

	def update_predict (self):
		if self.pinyinlist:
			self.predict = self.get_predict_pinyinlist (self.pinyinlist) 
		else:
			self.predict = []

	def reverse(self, phrase):
		"""Rebuild the word list from already converted text.

		*phrase* is consumed from the left: the longest prefix known to the
		database is looked up and one PinYinWord is appended for each of its
		characters.  Raises InputException when no prefix is found.
		"""
		self.clear()
		while phrase:
			for end in range(len(phrase), 0, -1):
				rows = self.database.select_phrase(phrase[:end])
				if rows:
					match = rows[0]
					break
			else:
				raise InputException()
			matched = match[YLEN]
			for pos in range(matched):
				word = PinYinWord("'","")
				word.set_char(phrase[pos])
				if pos < 4:
					pinyin_id = match[pos+1]
				else:
					# Syllables after the fourth are spelled out in YX.
					pinyin_id = PINYIN_DICT[match[YX].split("'")[pos-4]]
				word.set_pinyin_id (pinyin_id)
				word.real_pinyin_id = word._pinyin_id
				self.wordlist.append(word)
			phrase = phrase[matched:]
	def split_phrase (self, string):
		"""Segment the converted text *string* into known phrases.

		Returns a list of (start, length, text) tuples covering the word list
		from left to right.
		"""
		start = 0
		phrase_list = []
		while start < len(self.wordlist)-1:
			candidate = None
			if len(self.wordlist) - start >= 3:
				# At least three words left: query with the next three words.
				phrase = Editor.database.select_words_by_pinyin_list_all (self.wordlist[start:start + 3])
				for i in phrase:
					# Keep only rows whose text matches what is at *start*.
					if i[PHRASE] == string[start:start + len(i[PHRASE]) ]:
						# Among the matches the greater phrase text (string
						# comparison) wins.
						if not candidate or candidate[PHRASE] < i[PHRASE]:
							candidate = i
				if candidate == None:
					# Fall back to a lookup with the next two words.
					phrase = Editor.database.select_words_by_phrase (self.wordlist[start:start+2])
					if phrase:
						candidate = phrase[0]			
			else:
				phrase = Editor.database.select_words_by_phrase (self.wordlist[start:start+2])
				if phrase:
					candidate = phrase[0]
			if candidate == None:
				# Nothing known: emit a single character.
				phrase_list.append ( (start, 1, string[start]))
				start += 1
			else:
				phrase_list.append ( (start, len(candidate[PHRASE]), candidate[PHRASE]))
				start += len(candidate[PHRASE])
		# The loop stops one word early; add the last word when still uncovered.
		if start < len (self.wordlist):
			phrase_list.append ((start,1,string[-1]))
		return phrase_list
	def split_phrasev2 (self, string):
		"""Segment *string* into (start, length, text) tuples.

		Variant of split_phrase() that always queries the database with the
		next two words and keeps rows whose text matches *string* at that
		position.
		"""
		total = len(self.wordlist)
		pieces = []
		pos = 0
		while pos < total - 1:
			rows = Editor.database.select_words_by_pinyin_list_all (self.wordlist[pos:pos+2])
			best = None
			for row in rows:
				text = row[PHRASE]
				if text != string[pos:pos + len(text)]:
					continue
				# Among matching rows the greater phrase text wins.
				if not best or best[PHRASE] < text:
					best = row
			if best == None:
				pieces.append ((pos, 1, string[pos]))
				pos += 1
				continue
			pieces.append ((pos, len(best[PHRASE]), best[PHRASE]))
			pos += len(best[PHRASE])
		# The loop stops one word early; add the last word when still uncovered.
		if pos < total:
			pieces.append ((pos, 1, string[-1]))
		return pieces

	def learn_user_words(self, phrase_list, string, sentence):
		if not self.userword:
			return
		start = 0
		while start < len (phrase_list):
			tmp_phrase_start = phrase_list[start][0]
			tmp_phrase = ""
			while start < len (phrase_list) and phrase_list[start][1] == 1 \
				and string[phrase_list[start][0]] != sentence[phrase_list[start][0]]:
				tmp_phrase += phrase_list[start][2]
				del phrase_list[start]			
			if tmp_phrase:
				phrase_list.insert (start, (tmp_phrase_start, len(tmp_phrase), tmp_phrase) )
			if len (tmp_phrase) > 1:
				Editor.database.add_phrase (self.wordlist[tmp_phrase_start:tmp_phrase_start + len(tmp_phrase)], USER_WORD) 	
				self.log_conv_error( sentence, string, phrase_list, tmp_phrase_start, tmp_phrase_start, 0)
				string = string[:tmp_phrase_start] + sentence[tmp_phrase_start:tmp_phrase_start + len(tmp_phrase)] + string[tmp_phrase_start + len(tmp_phrase):]
			start += 1
		return string
		

	def split_predict (self):
		"""Segment the word list by repeatedly taking the first predicted phrase.

		Returns a list of (start, length, text) tuples.
		"""
		pieces = []
		pos = 0
		while pos < len (self.wordlist):
			text = self.get_predict (self.wordlist[pos:])[0][PHRASE]
			pieces.append ((pos, len(text), text))
			pos += len (text)
		return pieces

	def addphrase (self, phrase_list, pstart, pend, freq):
		if pstart < 0:
			return
		if pend >= len (phrase_list):
			return
		Editor.database.add_phrase(\
			self.wordlist[phrase_list[pstart][0]:(phrase_list[pend][0]+phrase_list[pend][1])], freq)

	def adjust_all_freq (self, phrase_list):
		if not self.userphrase:
			return
		p = [ self.wordlist[i[0]:i[0]+i[1]] for i in phrase_list]
		for i in p:
			Editor.database.adjust_phrase_freq (i)

	def adjust_freq (self, phrase_list, phrase_begin):
		if not self.adjustfreq:
			return
		i = phrase_list[phrase_begin]
		p = self.wordlist[i[0]:i[0]+i[1]]
		Editor.database.adjust_phrase_freq (p)
				
	def delete_phrase(self, n):
		"""Remove the *n*-th candidate of the current page from the database.

		Raises InputException when *n* is outside the page, or when the
		candidate's frequency is 0 or a multiple of neither USER_PHRASE nor
		USER_WORD.
		"""
		table = self.lookup_table
		if n >= table.get_current_page_size():
			raise InputException()
		candidate = self.candidates[table.get_current_page_start() + n]
		freq = candidate[ADJ_FREQ]
		if freq == 0 or (freq % USER_PHRASE and freq % USER_WORD):
			raise InputException()
		Editor.database.remove_phrase (candidate)
		self.update ()
	
	def delete_cursor_phrase(self):
		"""Remove the candidate under the lookup-table cursor from the database.

		When the candidate's frequency is 0 or a multiple of neither
		USER_PHRASE nor USER_WORD, the candidate cursor is reset and
		InputException is raised.
		"""
		candidate = self.candidates[self.lookup_table.get_cursor_pos() ]
		freq = candidate[ADJ_FREQ]
		deletable = not (freq == 0 or (freq % USER_PHRASE and freq % USER_WORD))
		if not deletable:
			self.candidate_cursor = None
			raise InputException()
		Editor.database.remove_phrase (candidate)
		self.update ()

	def log_conv_error(self, predict, sentence, phrase_list, pstart, pend, type):
		if pstart < 0:
			return
		if pend >= len (phrase_list):
			return
		if self.logconverror:
			begin = phrase_list[pstart][0]
			end = phrase_list[pend][0]+phrase_list[pend][1]
			p = open(os.path.expanduser ("~/.scim/zhengju-conv-error.log"),'a')
			print >> p, sentence[begin:end].encode ("utf-8"), predict[begin:end].encode ("utf-8"), type
			p.close ()
			
	def learn (self):
		"""Learn from the difference between the prediction and the final text.

		The word list is segmented twice: as the engine would predict it
		(split_predict) and as the committed text (split_phrase).  Both
		segmentations are then walked in parallel; where they disagree, user
		phrases are added, frequencies adjusted and conversion errors logged,
		depending on the enabled options.
		"""
		if not self.userword and not self.userphrase and not self.adjustfreq and not self.logconverror:
			return
		predict = self.split_predict ()
		sentence = u"".join ([ i[2] for i in predict])
		# From here on the words carry the pinyin id of the selected candidate.
		for i in self.wordlist:
			i._pinyin_id = i.real_pinyin_id
		string = self.get_preedit ()
		phrase_list = self.split_phrase (string)
		string = self.learn_user_words(phrase_list, string, sentence)
		if not self.userphrase and not self.adjustfreq:
			return
		cur_phrase = 0
		cur_predict = 0
		phrase_begin = 0
		predict_begin = 0
		while cur_phrase < len(phrase_list):
			# Advance the predicted segmentation until its end offset reaches
			# the end offset of the current phrase.
			while predict[cur_predict][0]+ predict[cur_predict][1] < phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]:
				cur_predict += 1
			if predict[cur_predict][0]+ predict[cur_predict][1] > phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]:
				# The predicted segment ends later: take in the next phrase.
				cur_phrase += 1
			else:
				# Both segmentations end at the same offset: compare the spans.
				if string[phrase_list[phrase_begin][0]:phrase_list[cur_phrase][0] + phrase_list[cur_phrase][1]]!=\
					sentence[predict[predict_begin][0]:predict[cur_predict][0] + predict[cur_predict][1]]:
					if cur_phrase - phrase_begin == 0:
						if cur_predict - predict_begin == 0:
							# One phrase against one predicted segment: learn
							# it together with its left and right neighbour.
							self.addphrase(phrase_list, phrase_begin - 1, cur_phrase, USER_PHRASE)
							self.log_conv_error(sentence, string, phrase_list, phrase_begin - 1, cur_phrase, 1)
							self.addphrase(phrase_list, phrase_begin, cur_phrase + 1, USER_PHRASE)
							self.log_conv_error(sentence, string, phrase_list, phrase_begin, cur_phrase + 1, 1)
							self.adjust_freq (phrase_list, phrase_begin)
					else:
						# Several phrases in the span: learn the whole span.
						self.addphrase (phrase_list, phrase_begin, cur_phrase, USER_PHRASE)
						self.log_conv_error(sentence, string, phrase_list, phrase_begin, cur_phrase, 2)
				phrase_begin = cur_phrase + 1
				predict_begin = cur_predict + 1
				cur_phrase += 1
		Editor.database.clean_useless_phrase()
	
	def freq_alg(self, phrase1, phrase2):
		"""Return the combined score of two adjacent phrases.

		The score is the sum of the length-weighted frequencies of both
		phrases; get_predict() and get_predict_pinyinlist() use it to rank
		alternative two-phrase splits.
		"""
		return self._weighted_freq(phrase1) + self._weighted_freq(phrase2)

	def _weighted_freq(self, phrase):
		"""Weight a phrase frequency by 10 (one character), by the length
		(two or three characters) or by the squared length (four or more)."""
		length = len(phrase[PHRASE])
		if length == 1:
			return phrase[ADJ_FREQ] * 10
		if length < 4:
			return phrase[ADJ_FREQ] * length
		return phrase[ADJ_FREQ] * pow(length, 2)


	def get_predict_pinyinlist (self, pinyinlist):
		"""Predict a list of phrases covering the whole of *pinyinlist*.

		Returns a list of candidate rows (tuples from the database, or lists
		when a longer phrase had to be truncated).
		"""
		# 1. A phrase matching the whole list.
		candidates = Editor.database.select_words_by_pinyin_list (pinyinlist)
		if candidates:
			return [candidates[0]]
		else:
			# 2. A row from the "_all" lookup, cut down to the typed length.
			candidates = Editor.database.select_words_by_pinyin_list_all(pinyinlist)
			if candidates:
				p = list (candidates[0]);
				p[YLEN] = len (pinyinlist)
				p[PHRASE] = p[PHRASE][:p[YLEN]]
				return [p]
		# 3. The best two-phrase split of the longest possible prefix.
		max_freq = 0
		predict = []
		for length in range(len (pinyinlist), 1, -1):
			for i in range (1, length):
				candidates = Editor.database.select_words_by_pinyin_list (pinyinlist[:i])
				if not candidates:
					continue
				candidates2 = Editor.database.select_words_by_pinyin_list(pinyinlist[i:length])
				if not candidates2:
					candidates2 = Editor.database.select_words_by_pinyin_list_all(pinyinlist[i:length])
					if candidates2:
						# Truncate the second phrase to the typed length.
						p = list (candidates2[0]);
						p[YLEN] = length - i
						p[PHRASE] = p[PHRASE][:p[YLEN]]
						tmp_phrase = candidates[0]
						tmp_phrase2 = p
					else:
						continue
				else:
					tmp_phrase = candidates[0]
					tmp_phrase2 = candidates2[0]
				new_freq = self.freq_alg(tmp_phrase, tmp_phrase2)
				# ">=" lets a later split win a tie.
				if new_freq >= max_freq:
					predict = [tmp_phrase, tmp_phrase2]
					max_freq = new_freq
			if predict:
				break
		if self.predict_len(predict) < len (pinyinlist):
			# Predict the uncovered tail recursively.
			# NOTE(review): if no split was found, predict is empty and this
			# recurses on the same list -- presumably the database always
			# yields a match for a single syllable; confirm.
			return predict + self.get_predict_pinyinlist(pinyinlist[self.predict_len(predict):])
		else:
			return predict

	def get_predict (self, pinyinlist):
		"""Predict the leading phrase(s) of *pinyinlist*.

		Returns a one-element list when a single phrase (possibly truncated)
		matches the whole list, otherwise the two-phrase split that reaches
		furthest into the list, ties being broken by freq_alg().  Returns an
		empty list for empty input or when no split is found.
		"""
		if not pinyinlist:
			return []
		candidates = Editor.database.select_words_by_pinyin_list(pinyinlist)
		if candidates:
			return [candidates[0]]
		candidates = Editor.database.select_words_by_pinyin_list_all(pinyinlist)
		if candidates:
			# Cut the row from the "_all" lookup down to the typed length.
			p = list (candidates[0])
			p[YLEN] = len (pinyinlist)
			p[PHRASE] = p[PHRASE][:p[YLEN]]
			return [p]
		# Initialised so that "no split found" yields an empty result instead
		# of an UnboundLocalError at the return statement.
		predict = []
		max_freq = 0
		max_length = 0
		for i in range (1, len(pinyinlist)):
			candidates = Editor.database.select_words_by_pinyin_list (pinyinlist[:i])
			if not candidates:
				continue
			tmp_phrase = candidates[0]
			longest2 = Editor.database.get_longest_phrase_length (pinyinlist[i:])
			# p is the index of the last syllable covered by the second phrase.
			for p in range(i + longest2, i - 1, -1):
				if p < max_length:
					continue
				candidates2 = Editor.database.select_words_by_pinyin_list(pinyinlist[i:p+1])
				if candidates2:
					tmp_phrase2 = candidates2[0]
					new_freq = self.freq_alg(tmp_phrase, tmp_phrase2)
					if p > max_length or \
						(new_freq >= max_freq and p == max_length):
						predict = [tmp_phrase, tmp_phrase2]
						max_freq = new_freq
						max_length = p
		return predict

	def reparse (self, start):
		"""Re-assign characters automatically from word index *start* onwards.

		The first predicted phrase is written onto the words and the method
		recurses on the rest; it stops at the end of the word list or at the
		first manually selected word.
		"""
		if start == len (self.wordlist):
			return
		phrase = self.get_predict (self.wordlist[start:])[0][PHRASE]
		for offset, ch in enumerate(phrase):
			word = self.wordlist[start+offset]
			if word.manual:
				return
			word.set_char(ch)
		self.reparse (start + len (phrase))

	def wordlist_manual_select (self, candidate):
		"""Apply a manually chosen *candidate* to the words at the cursor.

		Each covered word receives the candidate's pinyin id and character and
		is flagged as manual; the cursor moves past the phrase, the remainder
		is reparsed and the editor refreshed.
		"""
		phrase = candidate[PHRASE]
		base = self.cursor
		for offset, ch in enumerate(phrase):
			word = self.wordlist[ base + offset ]
			if offset < 4:
				word.real_pinyin_id = candidate[ offset + 1 ]
			else:
				# Syllables after the fourth are spelled out in YX.
				word.real_pinyin_id = PINYIN_DICT[ candidate[YX].split("'")[ offset - 4 ] ]
			if self.enable_mohuyin:
				word.mohuyin()
			word.set_char( ch )
			word.manual = True
		self.cursor += len (phrase)
		if self.cursor < len (self.wordlist):
			self.reparse (self.cursor)
		self.update ()

	def commit (self):
		if self.pinyinlist:
			self.convert_all ()
		string = self.get_preedit ()
		self.learn ()
		self.clear ()
		return string

	def del_current (self):
		if self.pinyinlist:
			raise InputException ()
		if self.cursor > 0:
			del self.wordlist[self.cursor-1]
			self.cursor -= 1
			if len (self.wordlist) == 0:
				self.clear ()
			self.reparse_backtrace ();
			self.update()
		elif self.wordlist and self.cursor == 0:
			raise InputException()
		

	def del_next (self):
		if self.pinyinlist or self.cursor == len (self.wordlist):
			raise InputException ()
		else:
			del self.wordlist[self.cursor]
			if len (self.wordlist)==0:
				self.clear ()
			self.reparse_backtrace ();
			self.update()

	def move_cursor (self, move):
		if self.is_empty():
			raise InputException()
		if self.pinyinlist and (move<0 or self.candidate_cursor == None):
			raise InputException()
		if move > 0 and self.candidate_cursor != None:
			self.select_cursor()
		else:
			self.cursor += move
			if self.cursor < 0:
				self.cursor += len (self.wordlist) + 1
			elif self.cursor > len (self.wordlist):
				self.cursor = 0
			self.update ()

	def move_cursor_to (self, pos):
		if self.is_empty ():
			raise InputException ()
		if self.pinyinlist:
			self.convert_all ()
		if pos == 0:
			self.cursor = len(self.wordlist)
		elif pos > len(self.wordlist) + 1:
			raise InputException ()
		else:
			self.cursor = pos - 1
		self.update ()

	def select (self, n):
		#~ print self.lookup_table.get_current_page_size()
		if n >= self.lookup_table.get_current_page_size():
			raise InputException()
		candidate = self.candidates[self.lookup_table.get_current_page_start()+n]
		if self.pinyinlist:
			self.pinyin_select(candidate, True)
		else:
			self.wordlist_manual_select(candidate)

	def select_cursor (self):
		candidate = self.candidates[self.lookup_table.get_cursor_pos()]
		#~ print candidate[PHRASE]
		if self.pinyinlist:
			self.pinyin_select(candidate, True)
		else:
			self.wordlist_manual_select(candidate)

	def recursive_mohuyin_pinyinlist(self, pinyinlist):
		for i in self.mohuyin(pinyinlist[0].get_screen_pinyin()):
			if pinyinlist[1:]:
				for p in self.recursive_mohuyin_pinyinlist(pinyinlist[1:]):
					yield [PinYinWord(pinyin = i)] + p
			else:
				yield [PinYinWord(pinyin = i)]

	def recursive_mohuyin(self, strl):
		for i in self.mohuyin(strl[0]):
			if strl[1:]:
				for p in self.recursive_mohu(strl[1:]):
					yield [i] + p
			else:
				yield [i]
	
	def mohuyin(self, pinyin):
		"""Yield the fuzzy alternatives of one pinyin string.

		The pinyin is split into initial and final, each is replaced by its
		alternatives from ``shengmu_mohu`` / ``yunmu_mohu`` when present, and
		every combination that is a known initial or full pinyin is yielded.
		When *pinyin* itself is a partial pinyin, partial combinations are
		accepted too.  A leading "'" separator is kept on every result.
		"""
		if pinyin[:1] == "'":
			spliter = "'"
			pinyin = pinyin[1:]
		else:
			spliter = ""
		if pinyin[:2] in SHENGMU_LIST:
			shengmu = pinyin[:2]
			yunmu = pinyin[2:]
		elif pinyin[:1] in SHENGMU_LIST:
			shengmu = pinyin[:1]
			yunmu = pinyin[1:]
		else:
			shengmu = ""
			yunmu = pinyin
		if shengmu in self.shengmu_mohu:
			shengmu = self.shengmu_mohu[shengmu]
		else:
			shengmu = [shengmu]
		if yunmu in self.yunmu_mohu:
			yunmu = self.yunmu_mohu[yunmu]
		else:
			yunmu = [yunmu]
		partial = pinyin in PINYIN_PARTIAL_LIST
		# The original partial branch used "i" without looping over the
		# initials, which raised NameError; both branches now share one loop.
		for i in shengmu:
			for q in yunmu:
				combined = i + q
				if combined in SHENGMU_LIST or combined in PINYIN_LIST \
					or (partial and combined in PINYIN_PARTIAL_LIST):
					yield spliter + combined
	def parsr_mohuyin(self, pinyinlist):
		candidates = []
		if self.enable_mohuyin:
			ss = Set()
			for p in self.recursive_mohuyin_pinyinlist(pinyinlist):
				#~ print u" ".join( i.get_pinyin() for i in p) 
				for i in range (len (p), 0, -1):
					ss.update(Editor.database.select_words_by_pinyin_list (p[:i]))
			candidates = list(ss)
			candidates.sort(cmp = candidate_sort)
		else:
			for i in range (len (pinyinlist), 0, -1):
				candidates += Editor.database.select_words_by_pinyin_list (pinyinlist[:i])
		return candidates
				
	def update_candidates (self):
		if self.is_empty():
			self.candidates = []
		elif self.pinyinlist:
			self.candidates = self.parsr_mohuyin(self.pinyinlist)
		elif len(self.wordlist)>self.cursor:
			self.candidates = self.parsr_mohuyin(self.wordlist[self.cursor:])
		else:
			self.candidates = []
		self.update_lookup_table()

	def update_lookup_table (self):
		self.lookup_table.clear()
		self.lookup_table.show_cursor(False)
		for c in self.candidates:
			if  c[ADJ_FREQ] == 0 or c[ADJ_FREQ]%USER_PHRASE and c[ADJ_FREQ]%USER_WORD:
				self.lookup_table.append_candidate (c[PHRASE])
			else:
				attrs = [scim.Attribute (0, len(c[PHRASE]), scim.ATTR_FOREGROUND, RGB (0, 0, 0xef))]
				self.lookup_table.append_candidate (c[PHRASE], attrs)

class Engine (IMEngine):
	"""Input-method engine that drives an Editor from SCIM key events.

	The engine forwards keys to its Editor, mirrors the editor state into
	the preedit string, the aux string and the lookup table, and switches
	between a Chinese and an English mode.

	get_extra_string() is called throughout but is not defined in this
	class; presumably the schema-specific engines created by
	ZhengJuFactory provide it -- verify against those modules.
	"""

	# Half-width punctuation whose replacement does not depend on any state.
	_FIXED_FULL_WIDTH_PUNCTUATION = {
		u"\\": u"\u3001",
		u"^": u"\u2026\u2026",
		u"_": u"\u2014\u2014",
		u"$": u"\uffe5",
		u"<": u"\u300a",
		u">": u"\u300b",
	}

	def __init__ (self, factory, config, encoding, id):
		"""Create the editor, the status/setup properties and load the config."""
		IMEngine.__init__ (self, factory, config, encoding, id)
		self._editor = Editor ()
		self._lookup_table = scim.LookupTable (9)
		self._status_property = Property ("chinese", "CN")
		self._setup_property = Property ("setup", "", "/usr/share/scim/icons/setup.png")
		self._chinese_mode = True
		# Per-session state. reset() re-initialises all of it, but it has to
		# exist from construction on: a key event delivered before the first
		# reset()/focus_in() would otherwise raise AttributeError.
		self.origin_string = None
		self._double_quotation_state = False
		self._single_quotation_state = False
		self._prev_key = None
		self._shift_key = None
		self.pipe = None
		self.reload_config(config)
		self._log = scim.Log.Log ("ZhengJu")

	def clear(self):
		"""Do nothing here; called by reset() and by the Return/Escape handlers."""
		pass

	def reset(self):
		"""Flush or drop the pending input and restore the initial state.

		Words already held by the editor are committed; otherwise the editor
		is cleared. Afterwards the per-session state is re-initialised, the
		UI is refreshed and both properties are registered again.
		"""
		if self._editor.wordlist:
			self.commit_string (self._editor.commit())
		else:
			self._editor.clear()
		self.clear()
		self.origin_string = None
		self._double_quotation_state = False
		self._single_quotation_state = False
		self._prev_key = None
		self._shift_key = None
		self.pipe = None
		self.update ()
		props = [self._status_property, self._setup_property]
		self.register_properties (props)
		self.update_properties ()

	def update_preedit (self):
		"""Show the editor preedit plus the extra string, or hide it if empty."""
		string = self._editor.get_preedit () + self.get_extra_string()
		if string == u"":
			self.hide_preedit_string ()
		else:
			self.show_preedit_string ()
			self.update_preedit_string (string , [])
			self.update_preedit_caret (self._editor.get_screen_cursor())

	def focus_out(self):
		"""Reset the engine, then run the base-class focus-out handling."""
		self.reset()
		IMEngine.focus_out (self)

	def focus_in (self):
		"""Reset the engine, then run the base-class focus-in handling."""
		self.reset()
		IMEngine.focus_in (self)

	def trigger_property (self, property):
		"""React to a property: "chinese" toggles the mode, "setup" starts the helper."""
		if property == "chinese":
			self.change_mode ()
		elif property == "setup":
			self.start_helper ("61af6de6-c29d-421e-9e1b-e34a29c68c76")

	def update_candidate (self):
		"""Show the editor's lookup table when it has candidates, else hide it."""
		if self._editor.candidates:
			self.update_lookup_table(self._editor.lookup_table)
			self.show_lookup_table()
		else:
			self.hide_lookup_table ()

	def update_aux(self):
		"""Show the editor's aux text in blue while editor.predict is set, else hide it."""
		if self._editor.predict:
			self.show_aux_string ()
			string = self._editor.get_aux ()
			attrs = [scim.Attribute (0, len (string), scim.ATTR_FOREGROUND, RGB (0, 0, 0xef))]
			self.update_aux_string (string, attrs)
		else:
			self.hide_aux_string ()
			self.update_aux_string (u"")

	def update (self):
		"""Refresh the preedit string, the aux string and the lookup table."""
		self.update_preedit ()
		self.update_aux ()
		self.update_candidate ()

	def update_properties (self):
		"""Set the status label to the translated "CN" or "EN" and publish it."""
		if self._chinese_mode:
			self._status_property.label = _("CN")
		else:
			self._status_property.label = _("EN")
		self.update_property(self._status_property)

	def change_mode(self):
		"""Toggle between Chinese and English mode.

		When leaving Chinese mode the editor content is committed first.
		"""
		if self._chinese_mode:
			self.commit_string (self._editor.commit())
			self.update()
		self._chinese_mode = not self._chinese_mode
		self.update_properties ()

	def reload_config (self, config):
		"""Pass config to the editor and read the ProgressivePrompt setting."""
		self._editor.load_config(config)
		self.progresivepromp = config.read ("/IMEngine/Python/ZhengJu/ProgressivePrompt", False)

	def lookup_table_page_down (self):
		"""Move the editor's lookup table one page down and refresh the UI."""
		self._editor.lookup_table.page_down ()
		self.update()
		return True

	def lookup_table_page_up (self):
		"""Move the editor's lookup table one page up and refresh the UI."""
		self._editor.lookup_table.page_up ()
		self.update()
		return True

	def process_key_event (self, key):
		"""Entry point for key events.

		Handles the mode toggles, collects the output of a pending clipboard
		helper process, then dispatches key presses to the handler of the
		current mode. An InputException from the handler only beeps; any
		other exception beeps and is logged together with the editor state.
		The UI is refreshed after every dispatched key.

		Returns True or False; presumably SCIM treats True as "key
		consumed" -- confirm against the binding.
		"""
		is_shift = key.code in (KeyCode.KEY_Shift_L, KeyCode.KEY_Shift_R)
		if not self.get_extra_string():
			# Releasing Shift directly after a bare Shift press (see the
			# _shift_key bookkeeping below) toggles the mode.
			if is_shift and key.mask & KeyMask.ReleaseMask and self._shift_key:
				self.change_mode()
				return True
			# With the editor at its end, Shift+letter or an active CapsLock
			# leaves Chinese mode; the key itself is still processed below.
			if self._chinese_mode and self._editor.is_end():
				if key.mask == KeyMask.ShiftMask and KeyCode.KEY_A <= key.code <= KeyCode.KEY_Z \
					or key.mask & KeyMask.CapsLockMask:
					self.change_mode()
		# Remember whether this event is a Shift press without any modifier.
		self._shift_key = True if is_shift and key.mask == KeyMask.NullMask else None
		# popen2's poll() returns -1 while the child is still running.
		if self.pipe and self.pipe.poll() != -1:
			try:
				# Drop the trailing newline of the helper output.
				self.origin_string = unicode(self.pipe.fromchild.read()[:-1],"utf8")
				self._editor.reverse(self.origin_string)
				self._editor.move_cursor_to (1)
			except:
				self._editor.clear()
				self.beep ()
			else:
				self.commit_string(u"")
				self.update()
			finally:
				self.pipe = None
				# Returning from finally discards any exception still in
				# flight, so this key is always reported as handled.
				return True
		if key.mask & KeyMask.ReleaseMask:
			return False
		try:
			if self._chinese_mode:
				result = self.chinese_process_key_event (key)
			else:
				result = self.english_process_key_event (key)
			self._prev_key = key
		except InputException:
			self.beep ()
			return True
		except Exception:
			self.beep ()
			self._log.print_exc()
			self._log.log("DEBUG", self._editor.cursor)
			self._log.log("DEBUG", [i.char.encode("utf-8") for i in self._editor.wordlist] )
			self._log.log("DEBUG", [i.get_screen_pinyin() for i in self._editor.pinyinlist] )
			self._log.log("DEBUG", self._editor.get_preedit().encode ("utf-8"))
			return True
		else:
			return result
		finally:
			self.update()

	def english_process_key_event (self, key):
		"""English mode handles no key itself: always returns False."""
		return False

	def chinese_process_key_event (self, key):
		"""Handle one key press in Chinese mode.

		With nothing being edited, punctuation is committed in full-width
		form, Ctrl+r starts a helper process that prints the PRIMARY
		selection, and every other key is left unhandled (False).

		Otherwise the key is interpreted as an editing command (selection,
		paging, cursor movement, deletion, commit, cancel). Keys that match
		no command raise InputException.
		"""
		if self._editor.is_empty() and not self.get_extra_string():
			if key.code <= 127 and ascii.ispunct (chr (key.code)):
				self.commit_string (self.convert_to_full_width (unichr (key.code)))
				return True
			elif key.code == KeyCode.KEY_r and key.mask == KeyMask.ControlMask:
				if not self.pipe:
					# The output is collected by process_key_event on a later
					# key event, once the child has finished.
					self.pipe = popen2.Popen3("python -c" +'"import gtk; print gtk.clipboard_get(selection=\\"PRIMARY\\").wait_for_text()"')
				return True
			else:
				return False
		if key.code in (KeyCode.KEY_Control_L,KeyCode.KEY_Control_R,
			KeyCode.KEY_Alt_L, KeyCode.KEY_Alt_R):
			return True
		elif key.code in (KeyCode.KEY_KP_Space, KeyCode.KEY_space):
			if self._editor.candidates and self._editor.lookup_table.is_cursor_visible():
				self._editor.select_cursor()
				return True
			elif self._editor.pinyinlist:
				self._editor.convert_all ()
				return True
			elif self._editor.cursor < len (self._editor.wordlist):
				self._editor.jump_to_next_word()
				return True
			else:
				self.commit_string (self._editor.commit())
				return True
		elif key.code == KeyCode.KEY_BackSpace:
			if not self._editor.pinyinlist and self.get_extra_string():
				raise InputException()
			self._editor.del_current()
			return True
		elif key.code == KeyCode.KEY_Delete:
			if self._editor.lookup_table.is_cursor_visible():
				self._editor.delete_cursor_phrase ()
			else:
				self._editor.del_next ()
			return True
		elif KeyCode.KEY_1 <= key.code <= KeyCode.KEY_9 and key.mask & KeyMask.ControlMask:
			# The range starts at KEY_1: with KEY_0 included, Ctrl+0 reached
			# delete_phrase with index -1. Ctrl+0 now falls through the chain
			# and ends in InputException like any other unmapped key.
			self._editor.delete_phrase (key.code - KeyCode.KEY_1)
			return True
		elif KeyCode.KEY_0 <= key.code <= KeyCode.KEY_9 and key.mask & KeyMask.AltMask:
			self._editor.move_cursor_to (key.code - KeyCode.KEY_0)
			return True
		elif KeyCode.KEY_1 <= key.code <= KeyCode.KEY_9:
			self._editor.select (key.code-KeyCode.KEY_1)
			return True
		elif KeyCode.KEY_KP_1 <= key.code <= KeyCode.KEY_KP_9:
			self._editor.select (key.code-KeyCode.KEY_KP_1)
			return True
		elif key.code == KeyCode.KEY_Shift_L:
			if not self._editor.is_end():
				self._editor.select (0)
				# Consumed as a selection: the release must not toggle the mode.
				self._shift_key = None
			return True
		elif key.code == KeyCode.KEY_Shift_R:
			if not self._editor.is_end():
				self._editor.select (1)
				self._shift_key = None
			return True
		elif key.code in (KeyCode.KEY_equal, KeyCode.KEY_bracketright, KeyCode.KEY_Page_Down):
			if self._editor.candidates:
				self._editor.lookup_table.page_down ()
				return True
			else:
				raise InputException()
		elif key.code in (KeyCode.KEY_minus, KeyCode.KEY_bracketleft, KeyCode.KEY_Page_Up):
			if self._editor.candidates:
				self._editor.lookup_table.page_up ()
				return True
			else:
				raise InputException()
		elif key.code==KeyCode.KEY_Up:
			if self._editor.candidates:
				self._editor.lookup_table.cursor_up()
				self._editor.lookup_table.show_cursor(True)
				return True
			else:
				raise InputException()
		elif key.code==KeyCode.KEY_Down:
			if self._editor.candidates:
				self._editor.lookup_table.cursor_down()
				self._editor.lookup_table.show_cursor(True)
				return True
			else:
				raise InputException()
		elif key.code == KeyCode.KEY_Left or key.code == KeyCode.KEY_b and key.mask & KeyMask.ControlMask:
			self._editor.move_cursor (-1)
			return True
		elif key.code == KeyCode.KEY_Right or key.code == KeyCode.KEY_f and key.mask & KeyMask.ControlMask:
			if self.get_extra_string():
				raise InputException()
			self._editor.move_cursor (1)
			return True
		elif key.code == KeyCode.KEY_h and key.mask & KeyMask.ControlMask or key.code == KeyCode.KEY_Home:
			if self.get_extra_string():
				raise InputException()
			self._editor.move_cursor_to (1)
			return True
		elif key.code == KeyCode.KEY_e and key.mask & KeyMask.ControlMask or key.code == KeyCode.KEY_End:
			if self.get_extra_string():
				raise InputException()
			self._editor.move_cursor_to (0)
			return True
		elif key.code in (KeyCode.KEY_Return, KeyCode.KEY_KP_Enter):
			self.commit_string (self._editor.commit() + self.get_extra_string())
			self.clear()
			return True
		elif key.code == KeyCode.KEY_Escape or key.code == KeyCode.KEY_c and key.mask & KeyMask.ControlMask:
			if self.origin_string:
				# Text fetched through Ctrl+r is put back unchanged.
				self.commit_string(self.origin_string)
				self._editor.clear()
				self.origin_string = None
			elif self._editor.lookup_table.is_cursor_visible():
				self._editor.lookup_table.show_cursor(False)
				self._editor.update()
			else:
				self.clear()
				self._editor.clear()
			return True
		elif key.code <= 127 and ascii.ispunct (chr (key.code)) and not self.get_extra_string():
			if not self._editor.is_empty ():
				self.commit_string (self._editor.commit ())
			self.commit_string (self.convert_to_full_width (unichr (key.code)))
			return True
		else:
			raise InputException ()

	def convert_to_full_width (self, c):
		"""Return the replacement text for the half-width character c.

		"." stays "." when the previously processed key was a digit and
		becomes U+3002 otherwise. The double and single quote alternate
		between their opening and closing forms on successive calls. A few
		characters have fixed replacements; everything else goes through
		scim.unichar_half_to_full.
		"""
		if c == u".":
			prev = self._prev_key
			if prev and KeyCode.KEY_0 <= prev.code <= KeyCode.KEY_9:
				return u"."
			return u"\u3002"
		if c == u"\"":
			self._double_quotation_state = not self._double_quotation_state
			if self._double_quotation_state:
				return u"\u201c"
			return u"\u201d"
		if c == u"'":
			self._single_quotation_state = not self._single_quotation_state
			if self._single_quotation_state:
				return u"\u2018"
			return u"\u2019"
		if c in self._FIXED_FULL_WIDTH_PUNCTUATION:
			return self._FIXED_FULL_WIDTH_PUNCTUATION[c]
		return scim.unichar_half_to_full (c)

class ZhengJuFactory (IMEngineFactory):
	"""Factory that builds the engine matching the configured pinyin schema."""

	def __init__ (self, config):
		"""Fill in the factory metadata and keep config for create_instance."""
		IMEngineFactory.__init__ (self, config)
		self.name = _(u"ZhengJu")
		self.uuid = "59e29ad8-3c95-4cd0-b02f-e21bf1317f7a"
		self.authors = u"Yu Fan <yufanyufan@gmail.com>"
		self.icon_file = "/usr/share/scim/icons/scim-python.png"
		self.credits = u"GPL"
		self.help = _(u"Help For ZhengJu")
		self.set_languages ("zh")
		self._config = config

	def create_instance (self, encoding, id):
		"""Return a new engine for the schema stored under PinYinSchema.

		"QuanPin" and "ShuangPin" select their own engines; "JianPin", a
		missing setting and any unrecognised value all select the JianPin
		engine. The schema module is imported only when it is needed.
		"""
		schema = self._config.read ("/IMEngine/Python/ZhengJu/PinYinSchema", "JianPin")
		if schema == "QuanPin":
			import QuanPin
			engine_class = QuanPin.QuanPinEngine
		elif schema == "ShuangPin":
			import ShuangPin
			engine_class = ShuangPin.ShuangPinEngine
		else:
			import JianPin
			engine_class = JianPin.JianPinEngine
		return engine_class (self, self._config, encoding, id)

	def reload_config (self, config):
		"""Replace the stored config; later create_instance calls read from it."""
		self._config = config
		
class PseudoConfig:
	"""Config stand-in that stores nothing: every read yields the default."""

	def read(self, string, default):
		"""Ignore the key given in string and return default unchanged."""
		return default

def train(file_name):
	"""Feed every run of text found in a UTF-8 file to a fresh Editor.

	Each line is split on digits, ASCII letters and non-word characters
	(the same pattern as before, so line endings are separators too); every
	non-empty piece is passed to editor.reverse() and then editor.learn().

	file_name -- path of the UTF-8 encoded text file.

	Raises Exception (without arguments) when a piece cannot be processed;
	the file name and the original traceback are printed first.
	"""
	print ("Training by " + file_name)
	import re
	editor = Editor()
	separators = re.compile(u"[\\da-zA-Z\\W]", re.UNICODE)
	source = open(file_name)
	try:
		for raw_line in source:
			# Strip only real line endings: cutting the last character
			# unconditionally lost one character of a final line that
			# does not end with a newline.
			line = unicode(raw_line, "utf8").rstrip(u"\r\n")
			for piece in separators.split(line):
				if not piece:
					continue
				try:
					editor.reverse(piece)
					editor.learn()
				except Exception:
					# Name the file being processed (this used to print the
					# builtin `file` type instead).
					print (file_name)
					traceback.print_exc ()
					raise Exception()
	finally:
		# Always release the file handle, also when training is aborted.
		source.close()

def print_usage():
	"""Print the two supported command-line forms to standard output."""
	print ("ZhengJu -f FILE\tRead sentences from file")
	print ("ZhengJu TEXT\tConvert parameter to pinyin")

if __name__ == "__main__":
	# Command-line entry point:
	#   ZhengJu -f FILE   train with the sentences read from FILE
	#   ZhengJu TEXT      print the pinyin and character of every word of TEXT
	# The exit status is 0 on success and 1 on bad usage or on any failure.
	import sys
	arguments = sys.argv[1:]
	if len(arguments) == 2 and arguments[0] == "-f":
		training = True
	elif len(arguments) == 1:
		training = False
	else:
		# A bad invocation is not a program error: show the usage only,
		# without the traceback of a dummy exception.
		print_usage()
		sys.exit(1)
	try:
		if training:
			# train() creates its own Editor, so none is built here.
			train(arguments[1])
		else:
			editor = Editor()
			try:
				editor.reverse(unicode(arguments[0],"utf8"))
				for i in editor.wordlist:
					# Python 2 print statements: the trailing comma keeps
					# all pairs on one line.
					print (ID_PINYIN_DICT[i.get_pinyin_id()]),
					print (i.char),
			except Exception:
				print ("Can't convert this to pinyin")
				# Re-raise the original error so that the traceback printed
				# below shows the real cause.
				raise
	except Exception:
		traceback.print_exc ()
		print_usage()
		sys.exit(1)
	else:
		sys.exit(0)