File: importer.py

package info (click to toggle)
sunpinyin 3.0.0~rc1%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,944 kB
  • sloc: cpp: 13,586; python: 923; makefile: 198
file content (138 lines) | stat: -rw-r--r-- 4,290 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python3
import os, sys
import struct
import sqlite3 as sqlite
from pinyin_data import valid_syllables, decode_syllable, initials, finals

def get_userdict_path ():
    """Return the path of sunpinyin's user dictionary for the current user.

    On macOS the fixed location below ~/Library is returned without checking
    that it exists.  On other platforms the known sunpinyin data directories
    are probed in order and the first one that exists is used.

    Raises RuntimeError when none of the candidate directories exists.
    """
    # Fall back to expanduser() so that an unset $HOME does not end in a
    # TypeError from the string concatenations below.
    homedir = os.environ.get("HOME") or os.path.expanduser("~")

    if sys.platform == "darwin":
        return homedir+"/Library/Application Support/SunPinyin/userdict"

    # FIXME: not sure how to get the ibus version or wrapper type (xim or ibus)
    for datadir in ("/.cache/ibus/sunpinyin", "/.ibus/sunpinyin", "/.sunpinyin"):
        if os.path.exists (homedir+datadir):
            return homedir+datadir+"/userdict"

    # Raising a plain string is itself a TypeError on Python 3, which hid the
    # message; raise a real exception instead.
    raise RuntimeError("Can not detect sunpinyin's userdict!")

def get_sysdict_path ():
    """Return the location of sunpinyin's system dictionary (pydict_sc.bin).

    Outside macOS this is a fixed path below /usr/lib.  On macOS the copy
    inside the current user's home directory is preferred when it exists,
    otherwise the system-wide input method bundle is used.
    """
    if sys.platform != "darwin":
        return "/usr/lib/sunpinyin/data/pydict_sc.bin"

    bundle_path = "/Library/Input Methods/SunPinyin.app/Contents/Resources/pydict_sc.bin"
    per_user_path = os.environ.get("HOME") + bundle_path
    return per_user_path if os.path.exists (per_user_path) else bundle_path

def load_system_dict ():
    """Return the set of words found in sunpinyin's system dictionary.

    The dictionary file is located with get_sysdict_path().  A native-order
    unsigned 32-bit integer stored at byte offset 8 gives the file offset of
    the word data; everything from that offset to the end of the file is
    decoded as UTF-32 text in which words are separated by NUL characters.

    Raises OSError when the file cannot be opened, struct.error when the
    header is truncated and UnicodeDecodeError when the word data is not
    valid UTF-32.
    """
    # "with" guarantees the handle is released even if parsing fails.
    with open (get_sysdict_path (), "rb") as f:
        f.seek (8)
        word_offset = struct.unpack ('I', f.read (4))[0]
        f.seek (word_offset)
        raw = f.read ()

    # Consecutive or trailing NULs yield empty strings; drop them.
    return {w for w in raw.decode ('UTF-32').split ('\0') if w}

def import_to_sunpinyin_user_dict (records, userdict_path=''):
    """Import (pinyin, word) records into sunpinyin's sqlite user dictionary.

    records       -- iterable of (pystr, utf8str) pairs, where pystr is the
                     pinyin of the word with syllables separated by "'" and
                     utf8str is the word itself.
    userdict_path -- path of the user dictionary; when empty the path is
                     detected with get_userdict_path().

    A record is skipped (the import continues) when:
      * one of its syllables is not a key of valid_syllables,
      * it has fewer than 2 or more than 6 syllables,
      * the word is already present in the system dictionary,
      * one of its syllables has an initial but no final,
      * the word is already present in the user dictionary.

    The "dict" table is created if it does not exist yet.  Inserts are
    committed every 100 rows and once more at the end.  Any other database
    error propagates to the caller; the connection is closed in all cases.
    """
    insert_sql = """
            INSERT INTO dict (len, i0, f0, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, utf8str)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
            """

    userdict_path = userdict_path if userdict_path else get_userdict_path()
    db = sqlite.connect (userdict_path)

    try:
        sysdict = load_system_dict()

        sqlstring = """
            CREATE TABLE IF NOT EXISTS dict(
            id INTEGER PRIMARY KEY, len INTEGER,
            i0 INTEGER, i1 INTEGER, i2 INTEGER, i3 INTEGER, i4 INTEGER, i5 INTEGER,
            f0 INTEGER, f1 INTEGER, f2 INTEGER, f3 INTEGER, f4 INTEGER, f5 INTEGER,
            utf8str TEXT, UNIQUE (utf8str));
            """
        db.executescript (sqlstring)

        batch_count = 0

        for (pystr, utf8str) in records:
            try:
                syllables = [valid_syllables[s] for s in pystr.split("'")]
            except KeyError:
                print("[%s] has un-recognized syllables, ignoring this record!" % pystr)
                continue

            if len (syllables) < 2 or len (syllables) > 6:
                print("[%s] is too long or too short for sunpinyin userdict" % utf8str)
                continue

            if utf8str in sysdict:
                continue

            # Each syllable code carries the initial in bits 12 and up and
            # the final in bits 4-11.
            pairs = [(s>>12, (s&0x00ff0)>>4) for s in syllables]

            # An initial without a final cannot be stored; skip the record.
            if any (i and not f for (i, f) in pairs):
                continue

            # Row layout: len, then (initial, final) for up to 6 syllables
            # with unused slots left at 0, then the word.
            record = [0]*14
            record[0] = len (syllables)
            record[13] = utf8str
            for n, (i, f) in enumerate (pairs):
                record[1 + 2*n] = i
                record[2 + 2*n] = f

            try:
                db.execute (insert_sql, record)
            except sqlite.IntegrityError:
                # UNIQUE (utf8str) violated: the word is already in the
                # user dictionary.
                continue

            batch_count += 1
            if batch_count == 100:
                db.commit ()
                batch_count = 0

        db.commit()
    finally:
        db.close()

def export_sunpinyin_user_dict (userdict_path=''):
    """Print every entry of sunpinyin's user dictionary to standard output.

    userdict_path -- path of the sqlite user dictionary; when empty the path
                     is detected with get_userdict_path().

    One line is printed per row of the "dict" table: the word, the row id
    and the pinyin of the word with its syllables joined by "'".  Rows are
    read with "SELECT *", so the column order id, len, i0..i5, f0..f5,
    utf8str is assumed.
    """
    userdict_path = userdict_path if userdict_path else get_userdict_path()
    db = sqlite.connect (userdict_path)
    try:
        rows = db.execute ("SELECT * FROM dict").fetchall ()
    finally:
        # All rows are in memory now; do not keep the database open.
        db.close ()

    for row in rows:
        rowid, length = row[0], row[1]
        ini_codes = row[2:8]
        fin_codes = row[8:14]
        word = row[-1]
        syls = [initials[i] + finals[f]
                for (i, f) in zip (ini_codes[:length], fin_codes[:length])]
        # Print the text itself: on Python 3, printing word.encode('UTF-8')
        # would emit the bytes repr (b'\xe4...') rather than the word.
        print(word, rowid, "'".join(syls))
        
# When executed as a script, dump the user dictionary found by
# get_userdict_path() to standard output.
if __name__ == "__main__":
    export_sunpinyin_user_dict ()

# -*- indent-tabs-mode: nil -*- vim:et:ts=4