1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
# load_unicode.py
# Load unicode data into a sqlite database.
#
# Fields taken from UnicodeData.txt (column -> meaning):
#   cp         codepoint
#   name       name
#   cat        general_category
#   combining  combining_category
#   bidi       bidi_class
import re
import sqlite3
class UnicodeDatabase:
    """Load Unicode Character Database text files into a sqlite table.

    All data lives in a single `codepoints` table (see `create_tables`).
    `load` populates it from the UCD text files found in the current
    directory and derives the `has_compat` and `precis` columns; the
    `check_*` methods compare the stored values with a reference
    implementation and print any differences.
    """

    # Matches one record of a UCD property file:
    #   "XXXX ; value # comment"  or  "XXXX..YYYY ; value ; ..."
    # group(1) = first codepoint (hex), group(2) = last codepoint (hex) or
    # None for a single codepoint, group(3) = property value.
    property_regex = re.compile(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\S+)\s*[#;]")

    def __init__(self, filename):
        """Open (or create) the sqlite database stored at `filename`."""
        self._conn = sqlite3.connect(filename)

    def load(self):
        """Load database files.

        Creates the table, inserts every codepoint, fills the property
        columns, derives `has_compat` and `precis`, then commits. The order
        matters: rows must exist before `_set_column` can update them, and
        `assign_precis` reads the columns written by the earlier steps.
        """
        self.create_tables()
        self.parse_unicodedata("UnicodeData.txt")
        self.add_non_characters()
        self.parse_properties("DerivedAge.txt", "age")
        self.parse_properties("Scripts.txt", "script")
        self.parse_properties("HangulSyllableType.txt", "hst")
        self.add_reserved_dicp()
        self.parse_property_present(
            "DerivedCoreProperties.txt", "dicp", "Default_Ignorable_Code_Point"
        )
        self.parse_property_present(
            "DerivedNormalizationProps.txt", "fce", "Full_Composition_Exclusion"
        )
        self.add_exceptions()
        self.assign_has_compat()
        self.assign_precis()
        self._conn.commit()

    def create_tables(self):
        """Create database tables.

        Raises sqlite3.OperationalError if the `codepoints` table already
        exists (the statement has no IF NOT EXISTS clause).
        """
        cur = self._conn.cursor()
        # `first_cp` is computed by `_insert`: -1 means no decomposition,
        # -2 means a compatibility decomposition, otherwise it is the first
        # codepoint of the canonical decomposition.
        ddl = """
            CREATE TABLE codepoints (
                cp INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                category TEXT NOT NULL,
                combining INTEGER NOT NULL,
                bidi TEXT NOT NULL,
                decomp TEXT NOT NULL,
                first_cp INTEGER NOT NULL, -- first cp in canonical decomp, -1 means no decomp, -2 means
                age REAL,
                script TEXT,
                hst TEXT, -- HangulSyllableType
                dicp INT, -- Default_Ignorable_Code_Point
                fce INT, -- Full_Composition_Exclusion
                has_compat INT,
                precis TEXT
            )
        """
        cur.execute(ddl)

    def parse_unicodedata(self, filename):
        """Load data from UnicodeData.txt file.

        Each line is split on ';'. Columns 0-5 are used as codepoint (hex),
        name, category, combining class, bidi class and decomposition.
        A pair of rows whose names end in ", First>" / ", Last>" describes a
        range: every codepoint in the range is inserted with a name of the
        form "<Prefix xxxx-yyyy>".

        Raises ValueError if a ", Last>" row has no matching ", First>" row.
        """
        cur = self._conn.cursor()
        first = None  # start of the currently open First/Last range, if any
        with open(filename, encoding="utf-8") as lines:
            for line in lines:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. at end of file)
                cols = line.split(";")
                cp = int(cols[0], 16)
                name = cols[1]
                if name.endswith(", First>"):
                    first = cp
                elif name.endswith(", Last>"):
                    if first is None:
                        raise ValueError("Range end without start: %s" % line.rstrip("\n"))
                    # Drop the 7-character ", Last>" suffix and append the range.
                    name = name[:-7] + " %4.4x-%4.4x>" % (first, cp)
                    for n in range(first, cp + 1):
                        self._insert(cur, n, name, cols[2], cols[3], cols[4], cols[5])
                    first = None
                else:
                    self._insert(cur, cp, name, cols[2], cols[3], cols[4], cols[5])

    def add_non_characters(self):
        """Add entries for non-characters.

        Inserts U+FDD0..U+FDEF and the last two codepoints (xxFFFE, xxFFFF)
        of each of the 17 planes, all with category 'Cn' and the name
        '<noncharacter>'.
        """
        cur = self._conn.cursor()
        for cp in range(0xFDD0, 0xFDEF + 1):
            self._insert(cur, cp, "<noncharacter>", "Cn", 0, "", "")
        for n in range(0, 17):
            cp1 = (n << 16) | 0xFFFE
            cp2 = (n << 16) | 0xFFFF
            self._insert(cur, cp1, "<noncharacter>", "Cn", 0, "", "")
            self._insert(cur, cp2, "<noncharacter>", "Cn", 0, "", "")

    def add_reserved_dicp(self):
        """Add entries for <reserved> chars that have the 'dicp' property.

        These rows must exist before the Default_Ignorable_Code_Point data
        is loaded, because `_set_column` only updates existing rows.
        """
        cur = self._conn.cursor()
        self._insert(cur, 0x2065, "<reserved>", "Cn", 0, "", "")
        for cp in range(0xFFF0, 0xFFF8 + 1):
            self._insert(cur, cp, "<reserved>", "Cn", 0, "", "")
        self._insert(cur, 0xE0000, "<reserved>", "Cn", 0, "", "")
        for cp in range(0xE0002, 0xE001F + 1):
            self._insert(cur, cp, "<reserved>", "Cn", 0, "", "")
        for cp in range(0xE0080, 0xE00FF + 1):
            self._insert(cur, cp, "<reserved>", "Cn", 0, "", "")
        for cp in range(0xE01F0, 0xE0FFF + 1):
            self._insert(cur, cp, "<reserved>", "Cn", 0, "", "")

    def _iter_property_records(self, filename):
        """Yield (first, last, value) string tuples from a UCD property file.

        Empty lines and lines starting with '#' are skipped. `last` is None
        for single-codepoint records. Raises ValueError on any other line
        that does not match `property_regex`.
        """
        with open(filename, encoding="utf-8") as lines:
            for line in lines:
                # Strip only the newline; slicing off the last character
                # would damage a final line that has no trailing newline.
                line = line.rstrip("\n")
                if not line or line[0] == "#":
                    continue
                m = self.property_regex.match(line)
                if not m:
                    raise ValueError("Parse failed: %s" % line)
                yield m.group(1), m.group(2), m.group(3)

    def parse_properties(self, filename, column):
        """Load data from a Unicode property file.

        Stores each record's property value in `column` for every codepoint
        in the record's range. Raises ValueError on an unparseable line.
        """
        cur = self._conn.cursor()
        for first, last, value in self._iter_property_records(filename):
            self._set_column(cur, column, first, last, value)

    def parse_property_present(self, filename, column, value):
        """Load data from a Unicode property file. Set `column` to 1 if we
        find `value`.

        Records whose property value differs from `value` are ignored.
        Raises ValueError on an unparseable line.
        """
        cur = self._conn.cursor()
        for first, last, found in self._iter_property_records(filename):
            if found == value:
                self._set_column(cur, column, first, last, 1)

    def _insert(self, cur, cp, name, category, combining, bidi, decomp):
        """Insert a codepoint into table.

        `combining` is converted with int(), so it may be a string or an int.
        """
        # Set `first_cp` depending on the value of the `decomp` field. If
        # `decomp` is empty, set first_cp to -1. If `decomp` is a compatibility
        # decomposition (starts with '<'), set first_cp to -2. Otherwise, set
        # first_cp to the character code of the first codepoint in `decomp`.
        if not decomp:
            first_cp = -1
        elif decomp.startswith("<"):
            first_cp = -2
        else:
            first_cp = int(decomp.split()[0], 16)
        cur.execute(
            "INSERT INTO codepoints (cp, name, category, combining, bidi, decomp, first_cp) VALUES (?, ?, ?, ?, ?, ?, ?)",
            (cp, name, category, int(combining), bidi, decomp, first_cp),
        )

    def _set_column(self, cur, column, first, last, value):
        """Set `column` to `value` for every codepoint in a range.

        `first` and `last` are hexadecimal strings; `last` may be None or
        empty for a single codepoint. Only rows where `column` is still NULL
        are updated. A codepoint whose update does not touch exactly one row
        (missing row, or column already set) is reported on stdout.

        `column` is interpolated into the SQL text, so it must be a trusted
        column name, never external input.
        """
        first = int(first, 16)
        last = int(last, 16) if last else first
        sql = "UPDATE codepoints SET %s=? WHERE cp=? AND %s IS NULL" % (column, column)
        for cp in range(first, last + 1):
            cur.execute(sql, (value, cp))
            if cur.rowcount != 1:
                print(
                    "failed update: %4.4x %s=%s [%04x-%04x]"
                    % (cp, column, value, first, last)
                )

    def add_exceptions(self):
        """Add PRECIS exceptions.

        Writes a fixed `precis` value for three hard-coded codepoint lists.
        `assign_precis` later skips rows whose `precis` is already set.
        """
        sql = """
            UPDATE codepoints SET precis = 'PVALID/exceptions' WHERE cp in (0x00DF,
                0x03C2, 0x06FD, 0x06FE, 0x0F0B, 0x3007);
            UPDATE codepoints SET precis = 'CONTEXTO/exceptions' WHERE cp in (0x00B7, 0x0375,
                0x05F3, 0x05F4, 0x30FB, 0x0660, 0x0661, 0x0662, 0x0663, 0x0664,
                0x0665, 0x0666, 0x0667, 0x0668, 0x0669, 0x06F0, 0x06F1, 0x06F2,
                0x06F3, 0x06F4, 0x06F5, 0x06F6, 0x06F7, 0x06F8, 0x06F9);
            UPDATE codepoints SET precis = 'DISALLOWED/exceptions' WHERE cp in (
                0x0640, 0x07FA, 0x302E, 0x302F, 0x3031, 0x3032, 0x3033, 0x3034,
                0x3035, 0x303B);
        """
        cur = self._conn.cursor()
        cur.executescript(sql)

    def assign_precis(self):
        """Assign precis derived property value to each codepoint.

        Must run after `add_exceptions` and `assign_has_compat`: rows that
        already have a `precis` value are left alone, and the CASE below
        reads `has_compat`, `hst` and `dicp`. The WHEN clauses are evaluated
        in order, so the first matching rule wins.
        """
        sql = """
            UPDATE codepoints SET precis = (
                CASE
                    -- unassigned
                    WHEN category = 'Cn' AND name != '<noncharacter>' THEN 'UNASSIGNED/unassigned'
                    -- ascii7
                    WHEN cp BETWEEN 0x21 AND 0x7E THEN 'PVALID/ascii7'
                    -- join_control
                    WHEN cp BETWEEN 0x200c AND 0x200d THEN 'CONTEXTJ/join_control'
                    -- old_hangul_jamo
                    WHEN hst IN ('L', 'V', 'T') THEN 'DISALLOWED/old_hangul_jamo'
                    -- precis_ignorable_properties
                    WHEN dicp == 1 OR name == '<noncharacter>' THEN 'DISALLOWED/precis_ignorable_properties'
                    -- controls
                    WHEN category = 'Cc' THEN 'DISALLOWED/controls'
                    -- has_compat
                    WHEN has_compat = 1 THEN 'FREE_PVAL/has_compat'
                    -- letter_digits
                    WHEN category IN ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc') THEN 'PVALID/letter_digits'
                    -- other_letter_digits
                    WHEN category IN ('Lt', 'Nl', 'No', 'Me') THEN 'FREE_PVAL/other_letter_digits'
                    -- spaces
                    WHEN category = 'Zs' THEN 'FREE_PVAL/spaces'
                    -- symbols
                    WHEN category IN ('Sm', 'Sc', 'Sk', 'So') THEN 'FREE_PVAL/symbols'
                    -- punctuation
                    WHEN category IN ('Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po') THEN 'FREE_PVAL/punctuation'
                    -- other
                    ELSE 'DISALLOWED/other'
                END
            ) WHERE precis IS NULL
        """
        cur = self._conn.cursor()
        cur.execute(sql)

    def assign_has_compat(self):
        """Assign true to characters that have compatibility decompositions.

        The intent is that `normalize('NFKC', ch) != ch` for exactly these
        characters; `check_has_compat` verifies that against a reference.
        """
        cur = self._conn.cursor()
        # Set has_compat=1 for characters whose decomp field begins with '<' or
        # has a 'full composition exclusion' of 1.
        sql = """
            UPDATE codepoints SET has_compat=1 WHERE decomp LIKE '<%' OR fce = 1
        """
        cur.execute(sql)
        # The set of codepoints with compatibility decompositions is not complete
        # until we include the set of approximately 15 chars whose CANONICAL
        # decomposition has a further COMPATIBILITY decomposition.
        sql = """
            UPDATE codepoints SET has_compat=1 WHERE cp IN (
                SELECT a.cp from codepoints a, codepoints b WHERE b.cp = a.first_cp AND b.first_cp == -2
            )
        """
        cur.execute(sql)

    def check_has_compat(self, ucd):
        """Check that has_compat is set to 1 for every character where
        normalize(NFKC, ch) != ch.

        `ucd` must provide a numeric `version` attribute and a
        `normalize(form, text)` method. Only codepoints whose `age` is not
        greater than `ucd.version` are checked. Mismatches are printed.
        """
        cur = self._conn.cursor()
        sql = "SELECT cp, has_compat, age FROM codepoints WHERE age <= %g" % ucd.version
        for cp, has_compat, age in cur.execute(sql):
            char = chr(cp)
            norm = ucd.normalize("NFKC", char)
            if has_compat == 1:
                if norm == char:
                    print("Invalid has_compat=1 for cp=%d, age=%s" % (cp, age))
            else:
                if norm != char:
                    print("Invalid has_compat=0 for cp=%d, age=%s" % (cp, age))

    def check_precis(self, ucd):
        """Compare derived property computation to `precis` value in database.

        `ucd` must provide a numeric `version` attribute and be accepted by
        `precis_i18n.derived.derived_property`. Only codepoints whose `age`
        is not greater than `ucd.version` are checked. Mismatches are printed.
        """
        from precis_i18n.derived import derived_property

        cur = self._conn.cursor()
        # Use the `ucd` argument, not a module-level global, so the method
        # works no matter how the caller names its UnicodeData object.
        sql = "SELECT cp, precis, age FROM codepoints WHERE age <= %g" % ucd.version
        for cp, precis, age in cur.execute(sql):
            prop = "%s/%s" % derived_property(cp, ucd)
            if prop != precis:
                print(
                    "Different precis value: %s vs %s for cp=%d, age=%s"
                    % (prop, precis, cp, age)
                )
if __name__ == "__main__":
    import sys

    # Open the database outside the try block so `db` is always bound when
    # the checks below run; a connection failure now surfaces directly
    # instead of as a later NameError.
    db = UnicodeDatabase("unicode.db")
    try:
        db.load()
    except sqlite3.OperationalError as err:
        # Best-effort load: presumably this happens when "unicode.db" was
        # already populated by an earlier run (CREATE TABLE then fails).
        # Keep going with the existing data, but say so instead of hiding it.
        print("Skipping load: %s" % err, file=sys.stderr)

    from precis_i18n.unicode import UnicodeData

    # Verify the stored values against the precis_i18n reference data.
    UCD = UnicodeData()
    db.check_has_compat(UCD)
    db.check_precis(UCD)
|