# XML name helpers: NCName validation and predicate URI splitting for RDF/XML.
#
# From: http://www.w3.org/TR/REC-xml#NT-CombiningChar
#
# * Name start characters must have one of the categories Ll, Lu, Lo,
# Lt, Nl.
#
# * Name characters other than Name-start characters must have one of
# the categories Mc, Me, Mn, Lm, or Nd.
#
# * Characters in the compatibility area (i.e. with character code
# greater than #xF900 and less than #xFFFE) are not allowed in XML
# names.
#
# * Characters which have a font or compatibility decomposition
# (i.e. those with a "compatibility formatting tag" in field 5 of the
# database -- marked by field 5 beginning with a "<") are not allowed.
#
# * The following characters are treated as name-start characters rather
# than name characters, because the property file classifies them as
# Alphabetic: [#x02BB-#x02C1], #x0559, #x06E5, #x06E6.
#
# * Characters #x20DD-#x20E0 are excluded (in accordance with Unicode
# 2.0, section 5.14).
#
# * Character #x00B7 is classified as an extender, because the property
# list so identifies it.
#
# * Character #x0387 is added as a name character, because #x00B7 is its
# canonical equivalent.
#
# * Characters ':' and '_' are allowed as name-start characters.
#
# * Characters '-' and '.' are allowed as name characters.
from unicodedata import category, decomposition
# Unicode general categories permitted for the first character of a name.
NAME_START_CATEGORIES = [
    "Ll",  # Letter, lowercase
    "Lu",  # Letter, uppercase
    "Lo",  # Letter, other
    "Lt",  # Letter, titlecase
    "Nl",  # Number, letter
]

# Unicode general categories permitted for any name character: every
# name-start category plus combining marks, modifier letters and digits.
NAME_CATEGORIES = NAME_START_CATEGORIES + [
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "Mn",  # Mark, nonspacing
    "Lm",  # Letter, modifier
    "Nd",  # Number, decimal digit
]

# Individual characters accepted inside a name even though their Unicode
# category is not one of NAME_CATEGORIES.
ALLOWED_NAME_CHARS = [
    u"\u00B7",  # MIDDLE DOT
    u"\u0387",  # GREEK ANO TELEIA
    u"-",
    u".",
    u"_",
]
# http://www.w3.org/TR/REC-xml-names/#NT-NCName
# [4] NCName ::= (Letter | '_') (NCNameChar)* /* An XML Name, minus
# the ":" */
# [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar
# | Extender
def is_ncname(name):
    """Return 1 if *name* is an acceptable NCName, otherwise 0.

    The first character must be ``_`` or belong to one of
    NAME_START_CATEGORIES.  Every following character must either belong
    to one of NAME_CATEGORIES or be listed in ALLOWED_NAME_CHARS.

    An empty (or otherwise falsy) *name* is reported as not being an
    NCName (returns 0) instead of raising IndexError.

    NOTE: the rule excluding characters that have a compatibility
    decomposition is not enforced by this check.
    """
    # A name needs at least one character; guard before indexing name[0].
    if not name:
        return 0

    first = name[0]
    if first != "_" and category(first) not in NAME_START_CATEGORIES:
        return 0

    for c in name[1:]:
        # Explicitly allowed punctuation ('-', '.', '_', middle dot, ...)
        # is valid even though its Unicode category is not a name category.
        if category(c) not in NAME_CATEGORIES and c not in ALLOWED_NAME_CHARS:
            return 0
    return 1
def split_uri(predicate):
    """Split *predicate* into a ``(namespace, local_name)`` tuple.

    The predicate is scanned from its end for the last character that
    cannot be part of a name, i.e. one that is neither in one of
    NAME_CATEGORIES nor in ALLOWED_NAME_CHARS.  The local name then starts
    at the first character after that position which may begin a name
    (``_`` or a character in one of NAME_START_CATEGORIES), using the same
    character rules as ``is_ncname``.  The namespace is everything before
    the local name, so ``namespace + local_name == predicate``.

    The search for the start of the local name never looks before the
    separator character, so the returned local name cannot contain it.

    Raises Error when no split is possible: the predicate is empty,
    consists only of name characters, or has no name-start character
    after its last separator.
    """
    length = len(predicate)

    # Walk backwards over the trailing run of name characters.
    for pos in range(length - 1, -1, -1):
        c = predicate[pos]
        if category(c) in NAME_CATEGORIES or c in ALLOWED_NAME_CHARS:
            continue

        # predicate[pos] is the last non-name character.  The local name
        # begins at the first name-start character that follows it.
        for start in range(pos + 1, length):
            head = predicate[start]
            if head == "_" or category(head) in NAME_START_CATEGORIES:
                # start >= 1 here, so the namespace part is never empty.
                return (predicate[:start], predicate[start:])

        # Nothing after the separator can start a name: give up.
        break

    # NOTE(review): `Error` is not defined or imported in the visible part
    # of this module -- confirm it is in scope, otherwise this line raises
    # NameError instead of the intended exception.
    raise Error("This graph cannot be serialized in RDF/XML. Could not split predicate: '%s'" % predicate)
|