1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
# libuninameslist
#
# Copyright (C) 2017, Shriramana Sharma
#
# This Python wrapper is subject to the same "BSD 3-clause"-type license
# which the wrapped C library is subject to.
'''
provides access to Unicode character names, annotations and block data
This Python wrapper implements:
1) three functions:
name, annotation, block
2) one generator:
blocks
3) one variable:
version
4) two convenience functions:
name2, uplus
`version` points to the internal version string of the library.
Run help() on the rest of the symbols for more info.
'''
# names exported by a star-import of this module
__all__ = ["name", "annotation", "block",
"blocks", "version",
"name2", "uplus"]
# connecting to the dynamic library
from ctypes import *
from ctypes.util import find_library
# Resolve the shared library before loading it: find_library() returns None
# when nothing is found, and CDLL(None) does not fail cleanly (on POSIX it
# opens the running process instead), which would only surface later as a
# confusing missing-symbol error.
_libPath = find_library("uninameslist")
if _libPath is None:
    raise ImportError("cannot find the 'uninameslist' shared library (libuninameslist)")
_lib = CDLL(_libPath)
def _setSig(fn, restype, argtypes):
    '''declares the signature of a foreign function object

    fn       -- the object whose `restype` / `argtypes` attributes are set
    restype  -- return type to declare; None leaves `fn.restype` untouched
    argtypes -- parameter types, assigned to `fn.argtypes` unconditionally
    '''
    if restype is not None:
        fn.restype = restype
    fn.argtypes = argtypes
# Declare the C signature of every wrapped entry point. Each row is
# (symbol name, restype, argtypes); the C prototype is quoted above its row.
for _symbol, _restype, _argtypes in (
    # const char *uniNamesList_NamesListVersion(void);
    ("uniNamesList_NamesListVersion", c_char_p, []),
    # const char *uniNamesList_name(unsigned long uni);
    ("uniNamesList_name", c_char_p, [c_ulong]),
    # const char *uniNamesList_annot(unsigned long uni);
    ("uniNamesList_annot", c_char_p, [c_ulong]),
    # int uniNamesList_blockCount(void);
    ("uniNamesList_blockCount", c_int, []),
    # int uniNamesList_blockNumber(unsigned long uni);
    ("uniNamesList_blockNumber", c_int, [c_ulong]),
    # long uniNamesList_blockStart(int uniBlock);
    ("uniNamesList_blockStart", c_long, [c_int]),
    # long uniNamesList_blockEnd(int uniBlock);
    ("uniNamesList_blockEnd", c_long, [c_int]),
    # const char *uniNamesList_blockName(int uniBlock);
    ("uniNamesList_blockName", c_char_p, [c_int]),
    # int uniNamesList_names2cnt(void);
    ("uniNamesList_names2cnt", c_int, []),
    # long uniNamesList_names2val(int count);
    ("uniNamesList_names2val", c_long, [c_int]),
    # int uniNamesList_names2getU(unsigned long uni);
    ("uniNamesList_names2getU", c_int, [c_ulong]),
    # int uniNamesList_names2lnC(int count);
    ("uniNamesList_names2lnC", c_int, [c_int]),
    # const char *uniNamesList_names2anC(int count);
    ("uniNamesList_names2anC", c_char_p, [c_int]),
):
    _setSig(getattr(_lib, _symbol), _restype, _argtypes)
# keep the module namespace free of the loop variables
del _symbol, _restype, _argtypes
# internal helpers
class _block:
    '''Simple class containing the name, start and end codepoints of a Unicode block'''

    __slots__ = ["name", "start", "end"]

    def __init__(self, name, start, end):
        self.name = name
        self.start = start
        self.end = end

    def __str__(self):
        # both bounds are rendered through uplus(), i.e. in U+XXXX form
        template = "<‘{}’: {} - {}>"
        return template.format(self.name, uplus(self.start), uplus(self.end))

    @staticmethod
    def _fromNum(num):
        '''builds a _block by querying the library for block number `num`'''
        blockName = _lib.uniNamesList_blockName(num).decode()
        firstCodepoint = _lib.uniNamesList_blockStart(num)
        lastCodepoint = _lib.uniNamesList_blockEnd(num)
        return _block(blockName, firstCodepoint, lastCodepoint)
# number of blocks reported by the library, queried once at import time
_blockCount = _lib.uniNamesList_blockCount()
# public symbols
# documents the version of libuninameslist (the library's version string, decoded to str)
version = _lib.uniNamesList_NamesListVersion().decode()
def name(char):
    '''returns the Unicode character name

    `char` is passed through ord(), so it must be a single character.
    An empty string is returned when the library yields no name (NULL).
    '''
    rawName = _lib.uniNamesList_name(ord(char))
    if rawName is None:
        return ""
    return rawName.decode()
def name2(char):
    '''returns the normative alias if defined for correcting a Unicode character name, else just the name'''
    aliasIndex = _lib.uniNamesList_names2getU(ord(char))
    if aliasIndex >= 0:
        # the alias is taken as the leading `aliasLength` bytes of the
        # annotation text stored for this index
        annotationText = _lib.uniNamesList_names2anC(aliasIndex)
        aliasLength = _lib.uniNamesList_names2lnC(aliasIndex)
        return annotationText[:aliasLength].decode()
    # negative index: no normative alias
    return name(char)
def annotation(char):
    '''returns all Unicode annotations including aliases and cross-references as provided by NamesList.txt

    An empty string is returned when the library yields no annotation (NULL).
    '''
    rawAnnotation = _lib.uniNamesList_annot(ord(char))
    if rawAnnotation is None:
        return ""
    return rawAnnotation.decode()
def blocks():
    '''a generator for iterating through all defined Unicode blocks

    Yields one _block per block number, from 0 up to the block count.
    '''
    yield from (_block._fromNum(index) for index in range(_blockCount))
def block(char):
    '''returns the Unicode block a character is in

    `char` is passed through ord(), so it must be a single character.
    Raises ValueError when the library reports no block for the character.
    '''
    blockNum = _lib.uniNamesList_blockNumber(ord(char))
    if blockNum < 0:
        # A negative number means "unlisted"; handing it on to _fromNum would
        # fail obscurely while decoding a NULL block name.
        raise ValueError("No Unicode block found for U+{:04X}".format(ord(char)))
    return _block._fromNum(blockNum)
# extras beyond what the C library itself provides
# every character the library's names2 table has an entry for, as one string
charactersWithName2 = "".join(
    [chr(_lib.uniNamesList_names2val(index))
     for index in range(_lib.uniNamesList_names2cnt())]
)
def uplus(char):
    '''convenience function to return the Unicode codepoint for a character in the format U+XXXX for BMP and U+XXXXXX beyond that

    `char` is either a single-character string or an integer codepoint
    (int subclasses are accepted as well).
    Raises ValueError for an integer outside 0..0x10FFFF, and TypeError
    (from ord()) for anything that is neither an integer nor a single character.
    '''
    # isinstance() rather than an exact type check so int subclasses work;
    # bool is excluded so it keeps being rejected by ord() as before.
    if isinstance(char, int) and not isinstance(char, bool):
        val = int(char)
        if not (0 <= val <= 0x10FFFF):
            raise ValueError("Invalid Unicode codepoint: U+{:X}".format(val))
    else:
        val = ord(char)
    return ("U+{:06X}" if val > 0xFFFF else "U+{:04X}").format(val)
# test
def _test():
    '''prints a short demonstration of version, name, annotation, name2, block, uplus and blocks'''
    sample = "ೞ"
    print("Using libuninameslist version:\n\t", version)
    print("The Unicode name of ೞ is:\n\t", name(sample))
    print("The Unicode annotation of ೞ is:")
    print(annotation(sample))
    print("The Unicode name (with corrections) of ೞ is:\n\t", name2(sample))
    print("The Unicode block of ೞ is:\n\t", block(sample))
    print("The Unicode codepoint of ೞ is:\n\t", uplus(sample))
    print()
    print("A complete list of blocks:")
    print("\n".join(map(str, blocks())))
# run the demonstration only when executed as a script, not on import
if __name__ == "__main__":
    _test()
|