File: pinyin_data.py

package info (click to toggle)
sunpinyin 3.0.0~rc1%2Bds1-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 1,944 kB
  • sloc: cpp: 13,586; python: 923; makefile: 198
file content (124 lines) | stat: -rwxr-xr-x 7,039 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/python3

# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
# 
# Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
# 
# The contents of this file are subject to the terms of either the GNU Lesser
# General Public License Version 2.1 only ("LGPL") or the Common Development and
# Distribution License ("CDDL")(collectively, the "License"). You may not use this
# file except in compliance with the License. You can obtain a copy of the CDDL at
# http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
# http://www.opensource.org/licenses/lgpl-license.php. See the License for the 
# specific language governing permissions and limitations under the License. When
# distributing the software, include this License Header Notice in each file and
# include the full text of the License in the License file as well as the
# following notice:
# 
# NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
# (CDDL)
# For Covered Software in this distribution, this License shall be governed by the
# laws of the State of California (excluding conflict-of-law provisions).
# Any litigation relating to this License shall be subject to the jurisdiction of
# the Federal Courts of the Northern District of California and the state courts
# of the State of California, with venue lying in Santa Clara County, California.
# 
# Contributor(s):
# 
# If you wish your version of this file to be governed by only the CDDL or only
# the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
# include this software in this distribution under the [CDDL or LGPL Version 2.1]
# license." If you don't indicate a single choice of license, a recipient has the
# option to distribute your version of this file under either the CDDL or the LGPL
# Version 2.1, or to extend the choice of license to its licensees as provided
# above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
# Version 2 license, then the option applies only if the new code is made subject
# to such option by the copyright holder. 

initials = ["", "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w", "hm", ]

finals = ["", "a", "o", "e", "ai", "ei", "ao", "ou", "an", "en", "ang", "eng", "er", "i", "ia", "ie", "iao", "iu", "ian", "in", "iang", "ing", "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ong", "v", "ve", "ue", "iong", "ng" ]

inner_fuzzy_finals = ['ia', 'iao', 'ian', 'iang', 'ie', 'ua', 'uai', 'uan', 'uang', 'ue', 've']

fuzzy_pairs = [
    ('z',       'zh'), 
    ('c',       'ch'), 
    ('s',       'sh'), 
    ('an',      'ang'), 
    ('on',      'ong'), 
    ('en',      'eng'), 
    ('in',      'ing'), 
    ('eng',     'ong'), 
    ('ian',     'iang'), 
    ('uan',     'uang'), 
    ('l',       'n'), 
    ('f',       'h'), 
    ('r',       'l'), 
    ('k',       'g'),
]

auto_correction_pairs = {
    'ign':      'ing',
    'img':      'ing',
    'uei':      'ui',
    'uen':      'un',
    'iou':      'iu',
}

valid_init_fin_pairs = [
    ([""],      ["a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "er", "o", "ou", "ng"]),
    (["b"],     ["", "a", "ai", "an", "ang", "ao", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o", "u"]),
    (["p"],     ["", "a", "ai", "an", "ang", "ao", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "o", "ou", "u"]),
    (["m"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iao", "ie", "in", "ing", "iu", "o", "ou", "u"]),
    (["f"],     ["", "a", "an", "ang", "ei", "en", "eng", "iao", "o", "ou", "u"]),
    (["d"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ia", "ian", "iao", "ie", "ing", "iu", "ong", "ou", "u", "uan", "ui", "un", "uo"]),
    (["t"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ian", "iao", "ie", "ing", "ong", "ou", "u", "uan", "ui", "un", "uo"]),
    (["n"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ian", "iang", "iao", "ie", "in", "ing", "iu", "ong", "ou", "u", "uan", "un", "uo", "v", "ve|ue"]),
    (["l"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "eng", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iu", "o", "ong", "ou", "u", "uan", "un", "uo", "v", "ve|ue"]),
    (["g", "k", "h"],   ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]),
    (["j", "q", "x"],    ["", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "u", "uan", "ue", "un"]),
    (["zh"],    ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]),
    (["ch"],    ["", "a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]),
    (["sh"],    ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo"]),
    (["r"],     ["", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "ua", "uan", "ui", "un", "uo"]),
    (["z"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]),
    (["c"],     ["", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]),
    (["s"],     ["", "a", "ai", "an", "ang", "ao", "e", "en", "eng", "i", "ong", "ou", "u", "uan", "ui", "un", "uo"]),
    (["y"],     ["", "a", "an", "ang", "ao", "e", "i", "in", "ing", "o", "ong", "ou", "u", "uan", "ue", "un"]),
    (["w"],     ["", "a", "ai", "an", "ang", "ei", "en", "eng", "o", "u"]),
    (["hm"],    [""]),
]

valid_syllables = {}

for (ii, ff) in valid_init_fin_pairs:
    for i in ii:
        for f in ff:
            fv = f.split('|')
            for fi in fv:
                valid_syllables[i+fi] = (initials.index(i) << 12) + (finals.index(fv[0]) << 4)

fuzzy_map = {}
for i, j in fuzzy_pairs:
    fuzzy_map.setdefault (i, []).append(j)
    fuzzy_map.setdefault (j, []).append(i)

fuzzy_pro_syllables = [s for s in valid_syllables if s[1:] in valid_syllables and s[0] in initials and s not in initials]
fuzzy_pre_syllables = [s for s in valid_syllables if s[:-1] in valid_syllables and s[-1] in initials and s not in initials]
initial_sets = {s[0] for s in fuzzy_pro_syllables} & {s[-1] for s in fuzzy_pre_syllables}
fuzzy_pro_syllables  = [s for s in fuzzy_pro_syllables if s[0] in initial_sets]
fuzzy_pre_syllables  = [s for s in fuzzy_pre_syllables if s[-1] in initial_sets]

def decode_syllable (s):
    return initials[(s >> 12)], finals[(s & 0x00ff0) >> 4]

def get_fuzzy_syllables (syllable):
    i, f = decode_syllable (syllable)
    ii = fuzzy_map.setdefault (i, []) + [i]
    ff = fuzzy_map.setdefault (f, []) + [f]
    sset = [valid_syllables[init + fin] for i in ii for f in ff if i + f in valid_syllables]
    sset.remove (syllable)
    return sset

# -*- indent-tabs-mode: nil -*- vim:et:ts=4