1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
|
#!/usr/bin/env python3
from pydict import *
from id import *
from valid_hanzi import *
import sys
from functools import cmp_to_key
def get_sheng_yun(pinyin):
    """Split a pinyin syllable into a ``(sheng, yun)`` pair.

    Args:
        pinyin: The syllable as a string, or None.

    Returns:
        A 2-tuple:

        * ``(None, None)`` when *pinyin* is None.
        * ``("", "en")`` for the special-cased syllable ``"ng"``.
        * ``(prefix, rest)`` where *prefix* is the longest prefix of at most
          two characters that is a key of ``SHENGMU_DICT`` and *rest* is the
          remainder of the syllable.
        * ``("", pinyin)`` when no such prefix exists.
    """
    if pinyin is None:
        return None, None
    if pinyin == "ng":
        return "", "en"
    # Try the two-character prefix before the one-character prefix so the
    # longest match wins.
    for length in (2, 1):
        prefix = pinyin[:length]
        if prefix in SHENGMU_DICT:
            return prefix, pinyin[len(prefix):]
    return "", pinyin
def read_phrases(filename):
    """Yield ``(hanzi, freq, pinyin)`` records parsed from a phrase file.

    The file is decoded as UTF-16. Each non-blank line is split on single
    spaces into four fields: the phrase, its frequency, a flag (parsed but
    not used here) and the rest of the line, which is split on whitespace
    into a list of pinyin syllables.

    Args:
        filename: Path of the UTF-16 encoded phrase file.

    Yields:
        Tuples ``(hanzi, freq, pinyin)`` where *freq* is a float and
        *pinyin* is a list of strings. Phrases containing any character
        that is not in ``valid_hanzi`` are skipped.

    Raises:
        ValueError: If a non-blank line has fewer than four fields or its
            frequency field is not a valid float.
    """
    # Read the whole file inside a context manager so the handle is closed
    # before any record is yielded, even if the consumer stops early.
    with open(filename, encoding='utf-16') as phrase_file:
        buf = phrase_file.read().strip()

    for line in buf.split('\n'):
        # An empty file yields a single empty string here; skip it (and any
        # other blank line) instead of failing on the 4-way unpack below.
        if not line.strip():
            continue
        hanzi, freq, _flag, pinyin = line.split(' ', 3)
        freq = float(freq)
        pinyin = pinyin.split()
        if any(c not in valid_hanzi for c in hanzi):
            continue
        yield hanzi, freq, pinyin
def _print_schema(table_count=16):
    """Print the PRAGMA settings and one CREATE TABLE statement per table."""
    print("PRAGMA synchronous = NORMAL;")
    print("PRAGMA temp_store = MEMORY;")
    print("PRAGMA default_cache_size = 5000;")

    create_sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
    for i in range(table_count):
        # Table i carries one (s<j>, y<j>) integer column pair for each
        # j in 0..i, i.e. i + 1 pairs.
        columns = []
        for j in range(i + 1):
            columns.append("s%d INTEGER" % j)
            columns.append("y%d INTEGER" % j)
        print(create_sql % (i, ",".join(columns)))


def _rank_by_frequency(records):
    """Return the records in ascending frequency order with freq replaced by an integer rank."""
    # A new rank starts only when the frequency exceeds the first frequency
    # of the current rank by more than about 0.1 %.
    threshold = 1 - 0.001
    ranked = []
    rank = 0
    max_freq = 0.0
    # A key-based sort is stable and well defined for equal frequencies,
    # unlike a comparator that never reports equality.
    for hanzi, freq, pinyin in sorted(records, key=lambda record: record[1]):
        # Guard the division: a zero frequency keeps the current rank
        # instead of raising ZeroDivisionError.
        if freq != 0 and max_freq / freq < threshold:
            max_freq = freq
            rank += 1
        ranked.append((hanzi, rank, pinyin))
    return ranked


def create_db(filename):
    """Print to stdout an SQL script that builds the phrase tables.

    The script consists of PRAGMA settings, CREATE TABLE statements for
    ``py_phrase_0`` .. ``py_phrase_15``, and one INSERT per phrase wrapped in
    a single transaction, followed by VACUUM. The statements are printed
    rather than executed against a database connection.

    Each phrase is inserted into ``py_phrase_<len(phrase) - 1>`` with its
    frequency rank and, for every pinyin syllable, the ``pinyin_id`` values of
    the two parts returned by ``get_sheng_yun``. Phrases are emitted from the
    highest rank to the lowest.

    Args:
        filename: Path of the phrase file, as accepted by ``read_phrases``.

    Raises:
        KeyError: If a syllable part is missing from ``pinyin_id``.
    """
    # The schema is printed before the phrase file is read.
    _print_schema()

    ranked = _rank_by_frequency(read_phrases(filename))

    print("BEGIN;")
    insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
    for hanzi, rank, pinyin in reversed(ranked):
        columns = []
        for py in pinyin:
            s, y = get_sheng_yun(py)
            s, y = pinyin_id[s], pinyin_id[y]
            columns.append(s)
            columns.append(y)
        # Double any single quote so the phrase remains a valid SQL string
        # literal.
        phrase = hanzi.replace("'", "''")
        values = "'%s', %d, %s" % (phrase, rank, ",".join(map(str, columns)))
        # NOTE(review): a phrase longer than 16 characters targets a table
        # that was never created, and a phrase whose syllable count differs
        # from its character count will not match the table's column count.
        print(insert_sql % (len(hanzi) - 1, values))
    print("COMMIT;")
    print("VACUUM;")
def main():
    """Command-line entry point.

    Passes the first command-line argument to ``create_db``. Exits with a
    usage message on stderr (non-zero status) when the argument is missing.
    """
    if len(sys.argv) < 2:
        program = sys.argv[0] if sys.argv else "<script>"
        sys.exit("usage: %s <phrase-file>" % program)
    create_db(sys.argv[1])


if __name__ == "__main__":
    main()
|