# load_unicode.py # Load unicode data into a sqlite database. # # cp codepoint # name name # cat general_category # combining combining_category # bidi import re import sqlite3 class UnicodeDatabase(object): property_regex = re.compile( r'^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(\S+)\s*[#;]') def __init__(self, filename): self._conn = sqlite3.connect(filename) def load(self): """ Load database files. """ self.create_tables() self.parse_unicodedata('UnicodeData.txt') self.add_non_characters() self.parse_properties('DerivedAge.txt', 'age') self.parse_properties('Scripts.txt', 'script') self.parse_properties('HangulSyllableType.txt', 'hst') self.add_reserved_dicp() self.parse_property_present('DerivedCoreProperties.txt', 'dicp', 'Default_Ignorable_Code_Point') self.parse_property_present('DerivedNormalizationProps.txt', 'fce', 'Full_Composition_Exclusion') self.add_exceptions() self.assign_has_compat() self.assign_precis() self._conn.commit() def create_tables(self): """ Create database tables. """ cur = self._conn.cursor() ddl = ''' CREATE TABLE codepoints ( cp INTEGER PRIMARY KEY, name TEXT NOT NULL, category TEXT NOT NULL, combining INTEGER NOT NULL, bidi TEXT NOT NULL, decomp TEXT NOT NULL, first_cp INTEGER NOT NULL, -- first cp in canonical decomp, -1 means no decomp, -2 means age REAL, script TEXT, hst TEXT, -- HangulSyllableType dicp INT, -- Default_Ignorable_Code_Point fce INT, -- Full_Composition_Exclusion has_compat INT, precis TEXT ) ''' cur.execute(ddl) def parse_unicodedata(self, filename): """ Load data from UnicodeData.txt file. """ cur = self._conn.cursor() for line in open(filename): cols = line.split(';') cp = int(cols[0], 16) name = cols[1] if name.endswith(', First>'): first = cp elif name.endswith(', Last>'): name = name[:-7] + ' %4.4x-%4.4x>' % (first, cp) for n in range(first, cp + 1): self._insert(cur, n, name, cols[2], cols[3], cols[4], cols[5]) else: self._insert(cur, cp, name, cols[2], cols[3], cols[4], cols[5]) def add_non_characters(self): """ Add entries for non-characters. """ cur = self._conn.cursor() for cp in range(0xfdd0, 0xfdef + 1): self._insert(cur, cp, '', 'Cn', 0, '', '') for n in range(0, 17): cp1 = (n << 16) | 0xfffe cp2 = (n << 16) | 0xffff self._insert(cur, cp1, '', 'Cn', 0, '', '') self._insert(cur, cp2, '', 'Cn', 0, '', '') def add_reserved_dicp(self): """ Add entries for chars that have the 'dicp' property. """ cur = self._conn.cursor() self._insert(cur, 0x2065, '', 'Cn', 0, '', '') for cp in range(0xfff0, 0xfff8 + 1): self._insert(cur, cp, '', 'Cn', 0, '', '') self._insert(cur, 0xe0000, '', 'Cn', 0, '', '') for cp in range(0xe0002, 0xe001f + 1): self._insert(cur, cp, '', 'Cn', 0, '', '') for cp in range(0xe0080, 0xe00ff + 1): self._insert(cur, cp, '', 'Cn', 0, '', '') for cp in range(0xe01f0, 0xe0fff + 1): self._insert(cur, cp, '', 'Cn', 0, '', '') def parse_properties(self, filename, column): """ Load data from a Unicode property file. """ cur = self._conn.cursor() for line in open(filename): line = line[:-1] if not line or line[0] == '#': continue m = self.property_regex.match(line) if not m: raise ValueError('Parse failed: %s' % line) self._set_column(cur, column, m.group(1), m.group(2), m.group(3)) def parse_property_present(self, filename, column, value): """ Load data from a Unicode property file. Set `column` to 1 if we find `value`. """ cur = self._conn.cursor() for line in open(filename): line = line[:-1] if not line or line[0] == '#': continue m = self.property_regex.match(line) if not m: raise ValueError('Parse failed: %s' % line) if m.group(3) == value: self._set_column(cur, column, m.group(1), m.group(2), 1) def _insert(self, cur, cp, name, category, combining, bidi, decomp): """ Insert a codepoint into table. """ # Set `first_cp` depending on the value of the `decomp` field. If # `decomp` is empty, set first_cp to -1. If `decomp` is a compatibility # decomposition (starts with '<'), set first_cp to -2. Otherwise, set # first_cp to the character code of the first codepoint in `decomp`. if not decomp: first_cp = -1 elif decomp.startswith('<'): first_cp = -2 else: first_cp = int(decomp.split()[0], 16) cur.execute( 'INSERT INTO codepoints (cp, name, category, combining, bidi, decomp, first_cp) VALUES (?, ?, ?, ?, ?, ?, ?)', (cp, name, category, int(combining), bidi, decomp, first_cp)) def _set_column(self, cur, column, first, last, value): """ Set a specific column in the """ first = int(first, 16) last = int(last, 16) if last else first sql = 'UPDATE codepoints SET %s=? WHERE cp=? AND %s IS NULL' % (column, column) for cp in range(first, last + 1): cur.execute(sql, (value, cp)) if cur.rowcount != 1: print('failed update: %4.4x %s=%s [%04x-%04x]' % (cp, column, value, first, last)) def add_exceptions(self): """ Add PRECIS exceptions. """ sql = ''' UPDATE codepoints SET precis = 'PVALID/exceptions' WHERE cp in (0x00DF, 0x03C2, 0x06FD, 0x06FE, 0x0F0B, 0x3007); UPDATE codepoints SET precis = 'CONTEXTO/exceptions' WHERE cp in (0x00B7, 0x0375, 0x05F3, 0x05F4, 0x30FB, 0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, 0x0668, 0x0669, 0x06F0, 0x06F1, 0x06F2, 0x06F3, 0x06F4, 0x06F5, 0x06F6, 0x06F7, 0x06F8, 0x06F9); UPDATE codepoints SET precis = 'DISALLOWED/exceptions' WHERE cp in ( 0x0640, 0x07FA, 0x302E, 0x302F, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x303B); ''' cur = self._conn.cursor() cur.executescript(sql) def assign_precis(self): """ Assign precis derived property value to each codepoint. This is called after exceptions and backward_compatible have been assigned their precis properties. """ sql = ''' UPDATE codepoints SET precis = ( CASE -- unassigned WHEN category = 'Cn' AND name != '' THEN 'UNASSIGNED/unassigned' -- ascii7 WHEN cp BETWEEN 0x21 AND 0x7E THEN 'PVALID/ascii7' -- join_control WHEN cp BETWEEN 0x200c AND 0x200d THEN 'CONTEXTJ/join_control' -- old_hangul_jamo WHEN hst IN ('L', 'V', 'T') THEN 'DISALLOWED/old_hangul_jamo' -- precis_ignorable_properties WHEN dicp == 1 OR name == '' THEN 'DISALLOWED/precis_ignorable_properties' -- controls WHEN category = 'Cc' THEN 'DISALLOWED/controls' -- has_compat WHEN has_compat = 1 THEN 'FREE_PVAL/has_compat' -- letter_digits WHEN category IN ('Ll', 'Lu', 'Lo', 'Nd', 'Lm', 'Mn', 'Mc') THEN 'PVALID/letter_digits' -- other_letter_digits WHEN category IN ('Lt', 'Nl', 'No', 'Me') THEN 'FREE_PVAL/other_letter_digits' -- spaces WHEN category = 'Zs' THEN 'FREE_PVAL/spaces' -- symbols WHEN category IN ('Sm', 'Sc', 'Sk', 'So') THEN 'FREE_PVAL/symbols' -- punctuation WHEN category IN ('Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po') THEN 'FREE_PVAL/punctuation' -- other ELSE 'DISALLOWED/other' END ) WHERE precis IS NULL ''' cur = self._conn.cursor() cur.execute(sql) def assign_has_compat(self): """ Assign true to characters that have compatibility decompositions. For these, `normalize('NFKC', ch) != ch`. """ cur = self._conn.cursor() # Set has_compat=1 for characters whose decomp field begins with '<' or # has a 'full composition exclusion' of 1. sql = ''' UPDATE codepoints SET has_compat=1 WHERE decomp LIKE '<%' OR fce = 1 ''' cur.execute(sql) # The set of codepoints with compatibility decompositions is not complete # until we include the set of approximately 15 chars whose CANONICAL # decomposition has a further COMPATIBILITY decomposition. sql = ''' UPDATE codepoints SET has_compat=1 WHERE cp IN ( SELECT a.cp from codepoints a, codepoints b WHERE b.cp = a.first_cp AND b.first_cp == -2 ) ''' cur.execute(sql) def check_has_compat(self, ucd): """ Check that has_compat is set to 1 for every character where normalize(NFKC, ch) != ch. """ cur = self._conn.cursor() sql = 'SELECT cp, has_compat, age FROM codepoints WHERE age <= %g' % ucd.version for cp, has_compat, age in cur.execute(sql): char = chr(cp) norm = ucd.normalize('NFKC', char) if has_compat == 1: if norm == char: print('Invalid has_compat=1 for cp=%d, age=%s' % (cp, age)) else: if norm != char: print('Invalid has_compat=0 for cp=%d, age=%s' % (cp, age)) def check_precis(self, ucd): """ Compare derived property computation to `precis` value in database. """ from precis_i18n.derived import derived_property cur = self._conn.cursor() sql = 'SELECT cp, precis, age FROM codepoints WHERE age <= %g' % UCD.version for cp, precis, age in cur.execute(sql): prop = '%s/%s' % derived_property(cp, ucd) if prop != precis: print('Different precis value: %s vs %s for cp=%d, age=%s' % (prop, precis, cp, age)) if __name__ == '__main__': try: db = UnicodeDatabase('unicode.db') db.load() except sqlite3.OperationalError: pass from precis_i18n.unicode import UnicodeData UCD = UnicodeData() db.check_has_compat(UCD) db.check_precis(UCD)