File: charconv.py

package info (click to toggle)
python-xml 0.4.19981014-1
links: PTS
area: main
in suites: slink
size: 2,124 kB
ctags: 3,099
sloc: ansic: 9,075; python: 8,150; xml: 7,940; makefile: 84; sh: 41
file content (179 lines) | stat: -rw-r--r-- 5,931 bytes

# Some experiments in adding character encoding conversions to xmlproc.
# This module is not yet used by the released xmlproc, since I'm awaiting
# a reorganization.

import string

# --- Conversion tables

# CP 850 to ISO 8859-1

# First element is no 128, second 129 ...
# The non-ISO characters, such as <empty set>, are mapped to non-ISO chars
# 127-145 and 147-159 in the order they appear in CP 850. Since there are
# more non-ISO chars than there is room for in these intervals, some of
# the last chars are also mapped to 159.
    
cp850_iso=[199,252,233,226,228,224,229,231,234,235,232,239,238,236,196,197,
           201,230,198,244,246,242,251,249,255,246,220,248,163,127,215,128,
           225,237,243,250,241,209,170,186,191,174,172,189,188,161,171,187,
           129,130,131,132,133,193,194,192,169,134,135,136,137,162,165,138,
           139,140,141,142,143,144,227,195,145,147,148,149,150,151,152,164,
           240,208,202,203,200,153,205,206,207,154,155,156,157,166,204,158,
           211,223,212,210,245,213,181,222,254,218,219,217,253,221,175,180,
           173,177,159,190,182,167,247,184,176,168,159,185,179,178,159,160]

cp850_iso_tbl=""
for ix in range(128):
    cp850_iso_tbl=cp850_iso_tbl+chr(ix)
for chno in cp850_iso:
    cp850_iso_tbl=cp850_iso_tbl+chr(chno)

# ISO 8859-1 to CP 850
    
iso_cp850=[0]*256
for ix in range(256):
    iso_cp850[ord(cp850_iso_tbl[ix])]=ix

iso_cp850_tbl=""
for chno in iso_cp850:
    iso_cp850_tbl=iso_cp850_tbl+chr(chno)

# --- Conversion functions

def utf8_to_iso8859(data):
    out=""

    ix=0
    for ix in range(len(data)):
        chn=ord(data[ix])
        if chn & 224==192: # 110xxxxx
#             print "%d %d -> %d" % (chn, ord(data[ix+1]),
#                                    ((chn & 3) << 6) + (ord(data[ix+1]) & 63))
            out=out+chr( ((chn & 3) << 6) + (ord(data[ix+1]) & 63))
        elif chn & 128==0: # 0xxxxxxx
            out=out+data[ix]

    return out

def iso8859_to_utf8(data):
    out=""

    for ch in data:
        if ord(ch)<128:
            out=out+ch
        else:
            chno=ord(ch)
            out=out+chr(192+((chno & 192)>>6))+chr(128+(chno & 63))

    return out

def cp850_to_iso8859(data):
    return string.translate(data,cp850_iso_tbl)

def iso8859_to_cp850(data):
    return string.translate(data,iso_cp850_tbl)

def id_conv(data):
    return data

def cp850_to_utf8(data):
    return iso8859_to_utf8(cp850_to_iso8859(data))

def utf8_to_cp850(data):
    return iso8859_to_cp850(utf8_to_iso8859(data))

# --- Conversion function database

class ConverterDatabase:
    """This class knows about all registered converting functions, and can be
    queried for information about converters."""

    def __init__(self):
        self.__map={}
        self.__alias_map={}

    def add_alias(self,canonical,alias):
        "Adds an alias for a character set."
        self.__alias_map[string.lower(alias)]=string.lower(canonical)
        
    def can_convert(self,from_encoding,to_encoding):
        """Returns true if converters to from from_encoding to to_encoding are
        known. Encoding names follow the syntax specified by the XML rec."""
        from_encoding=string.lower(from_encoding)
        to_encoding=string.lower(to_encoding)

        if from_encoding==to_encoding:
            return 1
        
        try:
            return self.__map[from_encoding].has_key(to_encoding)
        except KeyError:
            return 0

    def get_converter(self,from_encoding,to_encoding):
        """Returns a converter function that converts from the character
        encoding from_encoding to to_encoding. A KeyError will be thrown
        if no converter is known."""
        from_encoding=self._canonize_name(from_encoding)
        to_encoding=self._canonize_name(to_encoding)

        if from_encoding==to_encoding:
            return id_conv
        else:
            return self.__map[from_encoding][to_encoding]

    def add_converter(self,from_encoding,to_encoding,converter):
        from_encoding=self._canonize_name(from_encoding)
        to_encoding=self._canonize_name(to_encoding)
        
        if not self.__map.has_key(from_encoding):
            self.__map[from_encoding]={}

        self.__map[from_encoding][to_encoding]=converter

    def _canonize_name(self,name):
        "Returns the canonical form of a charset name."
        name=string.lower(name)
        if self.__alias_map.has_key(name):
            return self.__alias_map[name]
        else:
            return name
        
# --- Globals

convdb=ConverterDatabase()
convdb.add_alias("US-ASCII","ANSI_X3.4-1968")
convdb.add_alias("US-ASCII","iso-ir-6")
convdb.add_alias("US-ASCII","ANSI_X3.4-1986")
convdb.add_alias("US-ASCII","ISO_646.irv:1991")
convdb.add_alias("US-ASCII","ASCII")
convdb.add_alias("US-ASCII","ISO646-US")
convdb.add_alias("US-ASCII","us")
convdb.add_alias("US-ASCII","IBM367")
convdb.add_alias("US-ASCII","cp367")
convdb.add_alias("US-ASCII","csASCII")

convdb.add_alias("ISO-8859-1","ISO_8859-1:1987")
convdb.add_alias("ISO-8859-1","iso-ir-100")
convdb.add_alias("ISO-8859-1","ISO_8859-1")
convdb.add_alias("ISO-8859-1","latin1")
convdb.add_alias("ISO-8859-1","l1")
convdb.add_alias("ISO-8859-1","IBM819")
convdb.add_alias("ISO-8859-1","CP819")
convdb.add_alias("ISO-8859-1","csISOLatin1")

convdb.add_alias("IBM850","cp850")
convdb.add_alias("IBM850","850")
convdb.add_alias("IBM850","csPC850Multilingual")

convdb.add_converter("UTF-8","ISO-8859-1",utf8_to_iso8859)
convdb.add_converter("CP850","ISO-8859-1",cp850_to_iso8859)
convdb.add_converter("ISO-8859-1","CP850",iso8859_to_cp850)
convdb.add_converter("ISO-8859-1","UTF-8",iso8859_to_utf8)
convdb.add_converter("US-ASCII","UTF-8",id_conv)
convdb.add_converter("US-ASCII","ISO-8859-1",id_conv)
convdb.add_converter("US-ASCII","CP850",id_conv)
convdb.add_converter("UTF-8","CP850",utf8_to_cp850)
convdb.add_converter("CP850","UTF-8",cp850_to_utf8)