File: create_db.py

package info (click to toggle)
libpyzy 1.0.1-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 34,544 kB
  • sloc: python: 23,416; cpp: 20,929; makefile: 346; sql: 94; php: 16; sh: 16
file content (82 lines) | stat: -rwxr-xr-x 2,435 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
from pydict import *
from id import *
from valid_hanzi import *
import sys
from functools import cmp_to_key

def get_sheng_yun(pinyin):
    """Split a pinyin syllable into a leading part and the remainder.

    The leading part is the longest prefix of at most two characters that
    is found in ``SHENGMU_DICT``; everything after it is the remainder.

    Args:
        pinyin: The syllable as a string, or None.

    Returns:
        A 2-tuple:
        ``(None, None)`` when *pinyin* is None;
        ``("", "en")`` for the special-cased syllable ``"ng"``;
        ``(prefix, rest)`` when a two- or one-character prefix is in
        ``SHENGMU_DICT``;
        ``("", pinyin)`` otherwise.
    """
    if pinyin is None:
        return None, None
    # "ng" is mapped explicitly instead of going through the prefix search.
    if pinyin == "ng":
        return "", "en"
    # Try the two-character prefix first so the longest match wins.
    for length in (2, 1):
        prefix = pinyin[:length]
        if prefix in SHENGMU_DICT:
            return prefix, pinyin[len(prefix):]
    return "", pinyin

def read_phrases(filename):
    """Yield ``(hanzi, freq, pinyin)`` records parsed from a phrase file.

    The file is decoded as UTF-16. Every non-blank line must consist of
    four fields separated by single spaces: the phrase, its frequency, a
    flag, and the pinyin syllables (themselves whitespace-separated). The
    flag is parsed but not used. Phrases containing any character that is
    not in ``valid_hanzi`` are skipped.

    Args:
        filename: Path of the UTF-16 encoded phrase file.

    Yields:
        Tuples ``(hanzi, freq, pinyin)`` where *hanzi* is the phrase string,
        *freq* is a float and *pinyin* is a list of syllable strings.

    Raises:
        ValueError: If a non-blank line has fewer than four fields or its
            frequency field is not a valid float.
    """
    # Read everything and close the file before yielding anything, so the
    # handle is not kept open while the generator is suspended.
    with open(filename, encoding='utf-16') as phrase_file:
        buf = phrase_file.read()
    for line in buf.strip().split('\n'):
        # An empty file still produces one empty "line" here; skipping
        # blank lines avoids a ValueError from the unpacking below.
        if not line.strip():
            continue
        hanzi, freq, _flag, pinyin = line.split(' ', 3)
        freq = float(freq)
        pinyin = pinyin.split()
        if any(c not in valid_hanzi for c in hanzi):
            continue
        yield hanzi, freq, pinyin

def create_db(filename):
    """Print an SQL script that builds the phrase tables from *filename*.

    Nothing is executed here: the statements are only written to standard
    output. The script consists of

    * three PRAGMA settings,
    * one ``CREATE TABLE py_phrase_<n>`` per n in 0..15, where table n has
      the columns ``phrase``, ``freq`` and n + 1 pairs ``s<j>``/``y<j>``,
    * a single BEGIN/COMMIT transaction with one INSERT per phrase yielded
      by ``read_phrases``,
    * a final VACUUM.

    The float frequencies from the file are replaced by integer ranks:
    records are visited in ascending frequency order and the rank is
    increased whenever a frequency exceeds the last rank-defining frequency
    by more than about 0.1 %, so nearly equal frequencies share a rank.

    Args:
        filename: Path of the phrase file, handed unchanged to
            ``read_phrases``.

    Raises:
        Whatever ``read_phrases`` raises for a malformed file, and whatever
        the ``pinyin_id`` lookup raises for an unknown syllable part.
    """
    table_count = 16
    rank_tolerance = 0.001

    print("PRAGMA synchronous = NORMAL;")
    print("PRAGMA temp_store = MEMORY;")
    print("PRAGMA default_cache_size = 5000;")

    create_sql = "CREATE TABLE py_phrase_%d (phrase TEXT, freq INTEGER, %s);"
    for i in range(table_count):
        column = []
        for j in range(i + 1):
            column.append("s%d INTEGER" % j)
            column.append("y%d INTEGER" % j)
        print(create_sql % (i, ",".join(column)))

    records = list(read_phrases(filename))
    # Ascending by frequency. The comparator is kept exactly as it was
    # (it never reports equality) so that the relative order of records
    # with identical frequencies, and hence the emitted script, is unchanged.
    records.sort(key=cmp_to_key(lambda a, b: 1 if a[1] > b[1] else -1))

    ranked = []
    rank = 0
    max_freq = 0.0
    for hanzi, freq, pinyin in records:
        # The zero check prevents a ZeroDivisionError for zero-frequency
        # records; they keep the current rank.
        if freq != 0 and max_freq / freq < 1 - rank_tolerance:
            max_freq = freq
            rank = rank + 1
        ranked.append((hanzi, rank, pinyin))
    # Emit the highest-ranked phrases first.
    ranked.reverse()

    print("BEGIN;")
    insert_sql = "INSERT INTO py_phrase_%d VALUES (%s);"
    for hanzi, rank, pinyin in ranked:
        ids = []
        for py in pinyin:
            sheng, yun = get_sheng_yun(py)
            sheng_id, yun_id = pinyin_id[sheng], pinyin_id[yun]
            ids.append(sheng_id)
            ids.append(yun_id)
        # Double any single quote so the phrase stays a valid SQL string
        # literal; phrases without quotes are emitted unchanged.
        phrase = hanzi.replace("'", "''")
        values = "'%s', %d, %s" % (phrase, rank, ",".join(map(str, ids)))
        # NOTE(review): the table is chosen from the phrase length while the
        # number of id columns comes from the syllable count; this assumes
        # both agree and do not exceed the 16 tables created above - confirm.
        print(insert_sql % (len(hanzi) - 1, values))
    print("COMMIT;")
    print("VACUUM;")

def main():
    """Command-line entry point: emit the SQL script for one phrase file.

    The phrase file path is taken from the first command-line argument;
    any further arguments are ignored.

    Raises:
        SystemExit: With a usage message when no path argument was given.
    """
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        prog = sys.argv[0] if sys.argv else "create_db.py"
        sys.exit("usage: %s PHRASE_FILE" % prog)
    create_db(sys.argv[1])
 
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()