File: xml_names.py

package info (click to toggle)

python-feedvalidator 0~svn1022-2

links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd, squeeze, wheezy
size: 652 kB
ctags: 2,452
sloc: python: 9,481; makefile: 27; sh: 8

file content (77 lines) | stat: -rwxr-xr-x 2,784 bytes

parent folder | download | duplicates (2)

# From: http://www.w3.org/TR/REC-xml#NT-CombiningChar
#
# * Name start characters must have one of the categories Ll, Lu, Lo,
#   Lt, Nl.
#
# * Name characters other than Name-start characters must have one of
#   the categories Mc, Me, Mn, Lm, or Nd.
#
# * Characters in the compatibility area (i.e. with character code
#   greater than #xF900 and less than #xFFFE) are not allowed in XML
#   names.
#
# * Characters which have a font or compatibility decomposition
#   (i.e. those with a "compatibility formatting tag" in field 5 of the
#   database -- marked by field 5 beginning with a "<") are not allowed.
#
# * The following characters are treated as name-start characters rather
#   than name characters, because the property file classifies them as
#   Alphabetic: [#x02BB-#x02C1], #x0559, #x06E5, #x06E6.
#
# * Characters #x20DD-#x20E0 are excluded (in accordance with Unicode
#   2.0, section 5.14).
#
# * Character #x00B7 is classified as an extender, because the property
#   list so identifies it.
#
# * Character #x0387 is added as a name character, because #x00B7 is its
#   canonical equivalent.
#
# * Characters ':' and '_' are allowed as name-start characters.
#
# * Characters '-' and '.' are allowed as name characters.

from unicodedata import category, decomposition

# Unicode general categories allowed for the first character of a name.
NAME_START_CATEGORIES = [
    "Ll",  # lowercase letter
    "Lu",  # uppercase letter
    "Lo",  # other letter
    "Lt",  # titlecase letter
    "Nl",  # letter number
]

# Categories allowed for any later character: every name-start category
# plus combining marks, modifier letters and decimal digits.
NAME_CATEGORIES = NAME_START_CATEGORIES + [
    "Mc",  # spacing combining mark
    "Me",  # enclosing mark
    "Mn",  # nonspacing mark
    "Lm",  # modifier letter
    "Nd",  # decimal digit
]

# Individual characters accepted inside a name regardless of their category.
ALLOWED_NAME_CHARS = [
    u"\u00B7",  # MIDDLE DOT (extender)
    u"\u0387",  # GREEK ANO TELEIA (canonical equivalent of U+00B7)
    u"-",
    u".",
    u"_",
]

# http://www.w3.org/TR/REC-xml-names/#NT-NCName
#  [4] NCName ::= (Letter | '_') (NCNameChar)* /* An XML Name, minus
#      the ":" */
#  [5] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar
#      | Extender

def is_ncname(name):
    """Return 1 if *name* is an XML NCName (a Name without ':'), else 0.

    The first character must be '_' or have a Unicode general category
    listed in NAME_START_CATEGORIES.  Every following character must have
    a category listed in NAME_CATEGORIES or be one of ALLOWED_NAME_CHARS.

    An empty (or None) *name* is not a valid NCName and yields 0.

    NOTE: the XML rule excluding characters from the compatibility area /
    with a compatibility decomposition is not enforced here.
    """
    if not name:
        # No first character to inspect; an empty name is never valid.
        return 0

    first = name[0]
    if first != "_" and category(first) not in NAME_START_CATEGORIES:
        return 0

    for c in name[1:]:
        # A character is acceptable either by category or by being one of
        # the explicitly allowed punctuation/extender characters.
        if category(c) not in NAME_CATEGORIES and c not in ALLOWED_NAME_CHARS:
            return 0

    return 1
                
def split_uri(predicate):
    """Split *predicate* into a ``(namespace, local_name)`` tuple.

    The string is scanned from its end for the last character whose
    Unicode category is not in NAME_CATEGORIES.  The local name then starts
    at the first character after that point whose category is in
    NAME_START_CATEGORIES; everything before it is the namespace.

    Raises ValueError if no such split exists (for example when every
    character is a name character, or when no name-start character follows
    the last non-name character).
    """
    length = len(predicate)
    for i in range(length):
        # Position, counted from the front, of the i-th character from
        # the end of the string.
        boundary = length - 1 - i
        if category(predicate[boundary]) in NAME_CATEGORIES:
            continue

        # predicate[boundary] is the last non-name character.  Search only
        # the tail that follows it: indexing must not wrap around to the
        # start of the string, or the "local name" could contain non-name
        # characters.  Since predicate[boundary] itself can never match,
        # any hit has j > boundary >= 0 and the namespace is non-empty.
        for j in range(boundary, length):
            if category(predicate[j]) in NAME_START_CATEGORIES:
                return (predicate[:j], predicate[j:])
        break

    # No `Error` name is imported or defined in this module, so a builtin
    # exception is raised instead of failing with a NameError.
    raise ValueError("This graph cannot be serialized in RDF/XML. Could not split predicate: '%s'" % predicate)