1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
|
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpinyin - Library to deal with pinyin.
#
# Copyright (C) 2011 Peng Wu <alexepico@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
import os
import sys
import math
import pinyin
# Sorted list copies of the three tables exported by the project's `pinyin`
# module: full pinyin keys, shengmu (initials) and yunmu (finals).
pinyin_list, shengmu_list, yunmu_list = map(
    sorted,
    (pinyin.PINYIN_LIST, pinyin.SHENGMU_LIST, pinyin.YUNMU_LIST),
)
# Frequency table filled by load_phrase().  A plain pinyin string maps to its
# frequency; an entry written as "first'second" is keyed by the tuple
# (first, second) instead.
phrase_dict = {}


def load_phrase(filename):
    """Load pinyin frequencies from *filename* into ``phrase_dict``.

    Each line holds a pinyin string and an integer frequency separated by
    whitespace.  Blank lines and entries whose frequency is 0 are skipped.
    A pinyin string containing an apostrophe is split on it and stored under
    the ``(first_key, second_key)`` tuple; any other string is stored as is.
    An entry whose key is already present overwrites the earlier value.

    Raises:
        ValueError: if a non-blank line lacks a frequency field, the
            frequency is not an integer, or the pinyin string contains more
            than one apostrophe.
    """
    # "with" guarantees the file is closed even if a malformed line raises.
    with open(filename, "r") as phrasefile:
        # Iterate the file lazily rather than loading every line up front.
        for line in phrasefile:
            fields = line.split(None, 1)
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing empty line).
                continue
            pinyin_str, freq = fields
            # int() tolerates the trailing newline left on the last field.
            freq = int(freq)
            if freq == 0:
                continue
            if "'" in pinyin_str:
                first_key, second_key = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq
def gen_all_divided():
    """Yield every pinyin key that splits into two shorter pinyin keys.

    Yields:
        ``(pinyin_key, first_key, second_key)`` tuples in which all three
        strings come from ``pinyin_list`` and
        ``first_key + second_key == pinyin_key``.  Results follow the order
        of ``pinyin_list`` for ``pinyin_key`` and then for ``first_key``.
    """
    # The remainder is looked up once per candidate prefix; a set makes that
    # lookup constant-time instead of a scan over the whole list.
    known_keys = set(pinyin_list)
    for pinyin_key in pinyin_list:
        for first_key in pinyin_list:
            # Only a proper prefix leaves a non-empty remainder.
            if len(pinyin_key) <= len(first_key):
                continue
            if not pinyin_key.startswith(first_key):
                continue
            second_key = pinyin_key[len(first_key):]
            if second_key in known_keys:
                yield pinyin_key, first_key, second_key
def filter_divided():
    """Yield the divided keys whose two-part form has a known frequency.

    For every ``(pinyin_key, first_key, second_key)`` produced by
    ``gen_all_divided()``, the record is kept only when
    ``(first_key, second_key)`` is a key of ``phrase_dict``.

    Yields:
        ``(pinyin_key, orig_freq, first_key, second_key, new_freq)`` where
        ``orig_freq`` is the frequency stored for the undivided key (0 when
        it has none) and ``new_freq`` is the frequency of the pair.
    """
    for whole_key, head, tail in gen_all_divided():
        pair = (head, tail)
        if pair in phrase_dict:
            whole_freq = phrase_dict.get(whole_key, 0)
            yield whole_key, whole_freq, head, tail, phrase_dict[pair]
def gen_all_resplit():
    """Yield key pairs whose syllable boundary can move one letter left.

    A key from ``pinyin_list`` ending in "n", "g" or "r" is paired with a
    final from ``yunmu_list`` that is itself a pinyin key.  The record is
    produced when the key without its last letter, and that last letter
    followed by the final, are both pinyin keys too.

    Yields:
        ``(pinyin_key, yun, pinyin_key[:-1], pinyin_key[-1] + yun)`` tuples,
        e.g. ``("fang", "an", "fan", "gan")`` if all four strings are in
        ``pinyin_list``.
    """
    # Sets give constant-time membership tests inside the nested loops.
    known_keys = set(pinyin_list)
    # Only finals that are complete pinyin keys take part; filter them once
    # instead of re-checking on every outer iteration.
    standalone_yunmu = [yun for yun in yunmu_list if yun in known_keys]
    for pinyin_key in pinyin_list:
        # endswith() with a tuple also copes with an empty key.
        if not pinyin_key.endswith(("n", "g", "r")):
            continue
        # The shortened first key does not depend on the final, so it is
        # checked once per key rather than once per (key, final) pair.
        stem = pinyin_key[:-1]
        if stem not in known_keys:
            continue
        moved_letter = pinyin_key[-1]
        for yun in standalone_yunmu:
            new_pinyin_key = moved_letter + yun
            if new_pinyin_key in known_keys:
                yield pinyin_key, yun, stem, new_pinyin_key
'''
elif pinyin_key[-1] in ["e"]:
#check first new pinyin key
if pinyin_key[:-1] in pinyin_list:
yield pinyin_key, "r", pinyin_key[:-1], "er"
'''
def filter_resplit():
    """Yield the re-split records whose "new" pair has a known frequency.

    ``gen_all_resplit()`` produces ``(a, b, c, d)``.  The roles are reversed
    here because libpinyin's pinyin parser differs from ibus-pinyin's
    parser: ``(a, b)`` is treated as the new pair and ``(c, d)`` as the
    original pair.  A record is kept only when the new pair is a key of
    ``phrase_dict``.

    Yields:
        ``(orig_first_key, orig_second_key, orig_freq,
        new_first_key, new_second_key, new_freq)`` where ``orig_freq`` is 0
        when the original pair has no entry in ``phrase_dict``.
    """
    # Unpacking straight into the reversed roles replaces a separate swap.
    for fresh_head, fresh_tail, prior_head, prior_tail in gen_all_resplit():
        fresh_pair = (fresh_head, fresh_tail)
        if fresh_pair not in phrase_dict:
            continue
        prior_freq = phrase_dict.get((prior_head, prior_tail), 0)
        fresh_freq = phrase_dict[fresh_pair]
        yield (prior_head, prior_tail, prior_freq,
               fresh_head, fresh_tail, fresh_freq)
# Module initialisation: fill phrase_dict from both frequency tables.  This
# runs on import as well as when the file is executed as a script, and the
# relative paths are opened from the current working directory.
for _table_name in ("pinyins.txt", "specials.txt"):
    load_phrase(_table_name)


if __name__ == "__main__":
    # Print every divided-key record first, then every re-split record.
    for _produce in (filter_divided, filter_resplit):
        for p in _produce():
            print(p)
|