File: import_sogou_celldict.py

package info (click to toggle)
sunpinyin 3.0.0~rc1%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,944 kB
  • sloc: cpp: 13,586; python: 923; makefile: 198
file content (83 lines) | stat: -rwxr-xr-x 2,259 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python3

# Thanks to the reverse-engineering efforts of the following projects/people:
# http://code.google.com/p/imewlconverter
# http://code.google.com/p/ibus-cloud-pinyin
# http://forum.ubuntu.org.cn/viewtopic.php?f=8&t=250136&start=0

from importer import import_to_sunpinyin_user_dict
import struct
import os, sys

def read_utf16_str (f, offset=-1, len=2):
    """Read `len` bytes from `f` and return them decoded as UTF-16LE text.

    When `offset` is non-negative the file is first positioned at that
    absolute offset; a negative `offset` (the default) reads from the
    current position.
    """
    reposition = offset >= 0
    if reposition:
        f.seek(offset)
    return f.read(len).decode('UTF-16LE')

def read_uint16 (f):
    """Read two bytes from `f` and return them as a little-endian unsigned 16-bit integer."""
    raw = f.read(2)
    (value,) = struct.unpack ('<H', raw)
    return value

def get_word_from_sogou_cell_dict (fname):
    """Yield (pinyin, word) pairs parsed from a Sogou cell dictionary file.

    This is a generator; nothing is read until it is first iterated.

    File layout as consumed here:
      * byte 4 of the header selects where the word table starts
        (0x44 -> 0x2628, 0x45 -> 0x26c4);
      * 0x130 .. 0x1540 holds UTF-16LE metadata fields (title, type,
        description, samples) which are not needed and therefore skipped;
      * from 0x1540 + 4 comes the pinyin table: repeated records of
        (code: uint16, byte length: uint16, UTF-16LE syllable), read until
        the syllable 'zuo' has been seen;
      * from the word-table offset to the end of file come groups of words
        that share one pinyin sequence.

    Arguments:
        fname: path of the cell dictionary file.

    Yields:
        Tuples (py_str, word_str) where py_str is the syllables joined by
        "'" and word_str is the decoded word.

    Raises:
        SystemExit: with status 1 when the header byte is not recognised
            (or the file is too short to contain it).
        struct.error, KeyError: on truncated or otherwise malformed data.
    """
    # The context manager closes the file even if the consumer stops
    # iterating early or parsing fails part-way through.
    with open (fname, 'rb') as f:
        file_size = os.path.getsize (fname)

        header = f.read(128)
        # Indexing a bytes object already yields an int on Python 3, so no
        # struct.unpack is needed (passing the int to it raises TypeError).
        mask = header[4] if len (header) > 4 else None
        if mask == 0x44:
            hz_offset = 0x2628
        elif mask == 0x45:
            hz_offset = 0x26c4
        else:
            sys.stderr.write ("%s: unrecognized Sogou cell dict header\n" % fname)
            sys.exit(1)

        # Build the pinyin code -> syllable map; the first definition of a
        # code wins if the table repeats one.
        py_map = {}
        f.seek(0x1540+4)
        while True:
            py_code = read_uint16 (f)
            py_len  = read_uint16 (f)
            py_str  = read_utf16_str (f, -1, py_len)
            py_map.setdefault (py_code, py_str)

            # 'zuo' is treated as the last entry of the pinyin table.
            if py_str == 'zuo':
                break

        f.seek(hz_offset)
        while f.tell() != file_size:
            word_count   = read_uint16 (f)
            # The stored value is halved to get the number of 16-bit pinyin
            # ids that follow; integer division keeps it usable by range().
            pinyin_count = read_uint16 (f) // 2

            py_str = "'".join (py_map[read_uint16 (f)]
                               for _ in range(pinyin_count))

            for _ in range(word_count):
                word_len = read_uint16 (f)
                word_str = read_utf16_str (f, -1, word_len)
                f.read(12)  # simply ignore word frequency info
                yield py_str, word_str

def main ():
    """Command-line entry point.

    Expects exactly one command-line argument, the path of a Sogou PinYin
    cell dictionary file. The words parsed from it are handed to
    import_to_sunpinyin_user_dict. With any other argument count a hint is
    printed and the process exits with status 1.
    """
    if len (sys.argv) != 2:
        print("Please specify the Sogou PinYin Cell dict file!")
        # Use sys.exit: the bare exit() helper is injected by the site
        # module for interactive use and is missing under `python -S`.
        sys.exit (1)

    generator = get_word_from_sogou_cell_dict (sys.argv[1])
    import_to_sunpinyin_user_dict (generator)

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

# -*- indent-tabs-mode: nil -*- vim:et:ts=4