File: importer.py

package info (click to toggle)
sunpinyin 2.0.3+git20120607-1
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 1,372 kB
  • sloc: cpp: 14,549; python: 1,309; makefile: 154
file content (136 lines) | stat: -rw-r--r-- 4,239 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/python
import os, sys
import struct
import sqlite3 as sqlite
from pinyin_data import valid_syllables, decode_syllable, initials, finals

def get_userdict_path ():
    """Locate the SunPinyin user dictionary of the current user.

    On Mac OS X the path below "Library/Application Support" is returned
    without checking that it exists.  On every other platform the known
    configuration directories are probed in order of preference and the
    first one that exists wins.

    Returns:
        The path of the "userdict" file as a string.

    Raises:
        RuntimeError: if none of the known directories exists.
    """
    # $HOME may be unset; fall back to the platform's notion of the home
    # directory instead of failing on None + str.
    homedir = os.environ.get("HOME") or os.path.expanduser("~")

    if sys.platform == "darwin":
        return homedir+"/Library/Application Support/SunPinyin/userdict"

    # FIXME: not sure how to get the ibus version or wrapper type (xim or ibus)
    candidates = (
        "/.cache/ibus/sunpinyin",
        "/.ibus/sunpinyin",
        "/.sunpinyin",
    )
    for subdir in candidates:
        if os.path.exists (homedir+subdir):
            return homedir+subdir+"/userdict"

    # A plain string is not a valid exception object; raise a real one so
    # that callers see this message instead of a TypeError.
    raise RuntimeError ("Can not detect sunpinyin's userdict!")

def get_sysdict_path ():
    """Return the path of SunPinyin's system dictionary (pydict_sc.bin).

    Outside Mac OS X a fixed location under /usr/lib is returned.  On
    Mac OS X the copy below the user's home directory is returned when it
    exists, otherwise the same path relative to the filesystem root.
    """
    if sys.platform != "darwin":
        return "/usr/lib/sunpinyin/data/pydict_sc.bin"

    bundle_dict = "/Library/Input Methods/SunPinyin.app/Contents/Resources/pydict_sc.bin"
    per_user_dict = os.environ.get("HOME") + bundle_dict
    return per_user_dict if os.path.exists (per_user_dict) else bundle_dict

def load_system_dict (sysdict_path=''):
    """Load the set of words stored in SunPinyin's system dictionary.

    Args:
        sysdict_path: path of the binary dictionary file; when empty the
            path reported by get_sysdict_path() is used.

    Returns:
        A set holding every non-empty word found in the word table.
    """
    sysdict_path = sysdict_path if sysdict_path else get_sysdict_path ()

    # "with" closes the file even when the header is truncated or the
    # word table cannot be decoded.
    with open (sysdict_path, "rb") as f:
        # The unsigned int stored in the 4 bytes at offset 8 is the file
        # offset at which the word table starts.
        f.seek (8)
        word_offset = struct.unpack ('I', f.read (4))[0]
        f.seek (word_offset)
        raw_words = f.read ()

    # Everything from word_offset to the end of the file is decoded as
    # UTF-32 text in which the words are separated by NUL characters.
    return set (w for w in raw_words.decode ('UTF-32').split ('\0') if w)

def _build_userdict_record (syllables, utf8str):
    """Return the 14 values of one dict row, or None if a syllable is unusable.

    The layout follows the column list of the INSERT statement issued by
    import_to_sunpinyin_user_dict: the syllable count, then one
    (initial, final) pair per syllable, then the word itself.  Unused
    pairs stay 0.
    """
    record = [0]*14
    record[0] = len (syllables)
    record[13] = utf8str

    for n, s in enumerate (syllables):
        # split the encoded syllable into its initial and final parts
        i, f = s>>12, (s&0x00ff0)>>4
        if i and not f:
            # an initial without a final: the whole record is dropped
            return None
        record[1+2*n] = i
        record[2+2*n] = f

    return record

def import_to_sunpinyin_user_dict (records, userdict_path=''):
    """Import (pinyin, word) pairs into the SunPinyin user dictionary.

    Args:
        records: iterable of (pystr, utf8str) pairs, where pystr holds
            the syllables of the word separated by "'".
        userdict_path: path of the sqlite user dictionary; when empty the
            path reported by get_userdict_path() is used.

    Records with unknown syllables, with fewer than 2 or more than 6
    syllables, already present in the system dictionary or already
    present in the user dictionary are skipped.  Inserts are committed in
    batches of 100 and once more at the end.
    """
    userdict_path = userdict_path if userdict_path else get_userdict_path()
    db = sqlite.connect (userdict_path)

    create_sql = """
            CREATE TABLE IF NOT EXISTS dict(
            id INTEGER PRIMARY KEY, len INTEGER,
            i0 INTEGER, i1 INTEGER, i2 INTEGER, i3 INTEGER, i4 INTEGER, i5 INTEGER,
            f0 INTEGER, f1 INTEGER, f2 INTEGER, f3 INTEGER, f4 INTEGER, f5 INTEGER,
            utf8str TEXT, UNIQUE (utf8str));
            """
    insert_sql = """
            INSERT INTO dict (len, i0, f0, i1, f1, i2, f2, i3, f3, i4, f4, i5, f5, utf8str)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
            """

    # try/finally so that the connection is released on every exit path
    try:
        sysdict = load_system_dict()
        db.executescript (create_sql)

        batch_count = 0

        for (pystr, utf8str) in records:
            try:
                syllables = [valid_syllables[s] for s in pystr.split("'")]
            except (KeyError, AttributeError):
                # unknown syllable, or pystr is not a string at all
                print ("[%s] has un-recognized syllables, ignoring this record!" % pystr)
                continue

            if len (syllables) < 2 or len (syllables) > 6:
                print ("[%s] is too long or too short for sunpinyin userdict" % utf8str)
                continue

            if utf8str in sysdict:
                continue

            record = _build_userdict_record (syllables, utf8str)
            if record is None:
                continue

            try:
                db.execute (insert_sql, record)
            except sqlite.IntegrityError:
                # UNIQUE (utf8str) violated: already in the userdict
                continue
            except sqlite.Error as e:
                # keep importing the remaining records, but do not hide
                # the reason this one was rejected
                print ("[%s] can not be imported: %s" % (utf8str, e))
                continue

            batch_count += 1
            if batch_count == 100:
                db.commit ()
                batch_count = 0

        db.commit()
    finally:
        db.close()

def export_sunpinyin_user_dict (userdict_path=''):
    """Print every entry of the SunPinyin user dictionary to stdout.

    Args:
        userdict_path: path of the sqlite user dictionary; when empty the
            path reported by get_userdict_path() is used.

    One line is printed per row, in the form "<word> <id> <pinyin>",
    UTF-8 encoded, with the pinyin syllables joined by "'".
    """
    userdict_path = userdict_path if userdict_path else get_userdict_path()

    # Name the columns explicitly so that the slicing below does not
    # depend on the order in which the table happened to be created.
    sqlstring = """
            SELECT id, len,
                   i0, i1, i2, i3, i4, i5,
                   f0, f1, f2, f3, f4, f5,
                   utf8str
            FROM dict
            """

    db = sqlite.connect (userdict_path)
    try:
        rows = db.execute (sqlstring).fetchall ()
    finally:
        # all rows are already in memory; release the database right away
        db.close ()

    for row in rows:
        row_id, length = row[0], row[1]
        ini, fin, word = row[2:8], row[8:14], row[14]
        # only the first `length` (initial, final) pairs are meaningful
        syls = [initials[i] + finals[f]
                for i, f in zip (ini[:length], fin[:length])]
        line = u"%s %s %s" % (word, row_id, "'".join (syls))
        print (line.encode ('UTF-8'))
        
# When run as a script, print the auto-detected user dictionary to stdout.
if __name__ == "__main__":
    export_sunpinyin_user_dict ()