File: genmap_support.py

package info (click to toggle)
python-cjkcodecs 1.1.1-1
links: PTS
area: main
in suites: sarge
size: 2,848 kB
ctags: 1,231
sloc: ansic: 33,819; python: 2,388; makefile: 70; sh: 11
file content (281 lines) | stat: -rw-r--r-- 9,632 bytes
#
# genmap_support.py: Multibyte Codec Map Generator
#
# Copyright (C) 2003-2004 Hye-Shik Chang <perky@FreeBSD.org>.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# $Id: genmap_support.py,v 1.6 2004/06/29 05:55:09 perky Exp $
#

import re

# C-style comment block that printcopyright() writes to its output file.
# The text between the dollar signs looks like an RCS/CVS "Id" keyword
# expansion naming this generator script's revision -- presumably so that
# generated files record which generator version produced them.
COPYRIGHT_HEADER = """\
/*
 * $Id: genmap_support.py,v 1.6 2004/06/29 05:55:09 perky Exp $
 */
"""

class BufferedFiller:
    """Pack short text tokens into lines of at most ``column`` characters.

    Tokens handed to write() are concatenated (no separator) onto the
    current line; when the next token would push the line past ``column``
    the line is closed and a new one is started.  printout() emits every
    completed line followed by a newline.

    len(filler) is the total number of tokens ever written.  It is NOT
    reset by flush() or printout().
    """

    def __init__(self, column=78):
        self.column = column    # maximum width of one output line
        self.buffered = []      # completed lines waiting for printout()
        self.cline = []         # tokens of the line being assembled
        self.clen = 0           # combined length of the tokens in cline
        self.count = 0          # running count of tokens written

    def write(self, *data):
        """Append each string in ``data`` as one token.

        Raises ValueError if a single token is longer than ``column``,
        since it could never fit on a line.
        """
        for s in data:
            if len(s) > self.column:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError("token is too long")
            if len(s) + self.clen > self.column:
                self.flush()
            self.clen += len(s)
            self.cline.append(s)
            self.count += 1

    def flush(self):
        """Close the line being assembled (if any) and queue it for output."""
        if not self.cline:
            return
        self.buffered.append(''.join(self.cline))
        self.clen = 0
        del self.cline[:]

    def printout(self, fp):
        """Write every queued line to ``fp`` and empty the queue.

        ``fp`` only needs a write() method.
        """
        self.flush()
        for line in self.buffered:
            # fp.write() instead of the Python 2-only "print >> fp" statement.
            fp.write(line + '\n')
        del self.buffered[:]

    def __len__(self):
        return self.count

class UCMReader:
    """Iterate over the entries of a character-map file.

    Judging by the class name the input is presumably an ICU-style ".ucm"
    file.  Only lines between a ``CHARMAP`` line and an ``END CHARMAP`` line
    are parsed; each such line is expected to start with a token like
    ``<UXXXX>`` followed by an escaped byte string.
    """

    def __init__(self, fp):
        # Any iterable of text lines works (an open file, a list, ...).
        self.file = fp

    def itertokens(self):
        """Yield ``(index, data)`` for every entry inside the CHARMAP section.

        ``index`` is the integer parsed from the hex digits of the first
        token (its first two characters and last character are dropped);
        ``data`` is the result of parsedata() on the rest of the line.
        Text after a ``#`` is treated as a comment and ignored.
        """
        isincharmap = False
        for line in self.file:
            body = line.split('#', 1)[0].strip()
            if body == 'CHARMAP':
                isincharmap = True
            elif body == 'END CHARMAP':
                isincharmap = False
            elif isincharmap and body:
                # The "and body" guard skips blank and comment-only lines,
                # which would otherwise fail to unpack into two fields.
                index, data = body.split(None, 1)
                index = int(index[2:-1], 16)
                yield index, self.parsedata(data)

    def parsedata(self, data):
        """Decode the first whitespace-delimited token of ``data``.

        The token is evaluated as the body of a double-quoted Python string
        literal, so backslash escapes in it are expanded.
        """
        # NOTE(review): eval() executes whatever the map file contains.  Only
        # feed this reader trusted mapping files.
        return eval('"' + data.split()[0] + '"')

def printcopyright(fo):
    """Write COPYRIGHT_HEADER, followed by one extra newline, to ``fo``.

    ``fo`` only needs a write() method.
    """
    # fo.write() replaces the Python 2-only "print >> fo" statement; the
    # trailing newline that print used to append is added explicitly.
    fo.write(COPYRIGHT_HEADER + '\n')

class EncodeMapWriter:
    """Emit C source for a two-level encode map.

    Constructing an instance does all the work: buildmap() flattens the
    nested ``{c1: {c2: value}}`` mapping into one array of tokens and
    annotates each non-empty inner dict, then printmap() writes the flat
    array and a 256-entry index table to ``fp``.

    Subclasses can override ``filler_class``, ``elemtype``, ``indextype``
    and the write_* hooks to change what is emitted.
    """

    filler_class = BufferedFiller
    elemtype = 'DBCHAR'                 # C element type of the flat array
    indextype = 'struct unim_index'     # C element type of the index table

    def __init__(self, fp, prefix, map):
        self.file = fp
        self.prefix = prefix
        self.filler = self.filler_class()
        self.buildmap(map)
        self.printmap(map)

    def buildmap(self, emap):
        """Fill self.filler from ``emap`` and annotate its inner dicts.

        For every key 0..255 present in ``emap`` with a non-empty inner
        dict, the inner dict gains these extra keys:

        * ``self.prefix`` -> True (marks the row as belonging to this map)
        * ``'min'`` / ``'max'`` -> smallest / largest inner key
        * ``'midx'`` -> number of tokens written before this row

        One token is written for every value in min..max inclusive.
        Raises ValueError for a value that is neither int nor tuple.
        """
        for c1 in range(0, 256):
            if c1 not in emap:
                continue
            c2map = emap[c1]
            # sorted() works on Python 3 dict views, unlike keys().sort().
            rc2values = sorted(c2map)
            if not rc2values:
                continue

            c2map[self.prefix] = True
            c2map['min'] = rc2values[0]
            c2map['max'] = rc2values[-1]
            c2map['midx'] = len(self.filler)

            for v in range(rc2values[0], rc2values[-1] + 1):
                if v not in c2map:
                    self.write_nochar()
                    continue
                value = c2map[v]
                if isinstance(value, int):
                    self.write_char(value)
                elif isinstance(value, tuple):
                    self.write_multic(value)
                else:
                    raise ValueError(
                        "unsupported encode map value: %r" % (value,))

    def write_nochar(self):
        """Emit the placeholder token for a gap inside a row."""
        # 'N' is presumably a macro defined by the C code that includes the
        # generated table -- confirm against the consumer.
        self.filler.write('N,')

    def write_multic(self, point):
        """Emit the placeholder token for a tuple-valued entry."""
        # 'M' is presumably a macro in the consuming C code; ``point`` itself
        # is not written.
        self.filler.write('M,')

    def write_char(self, point):
        """Emit an integer entry as a decimal literal."""
        self.filler.write(str(point) + ',')

    def printmap(self, fmap):
        """Write the flat array and the 256-entry index table to self.file.

        ``fmap`` must be the mapping previously annotated by buildmap().
        Rows without the prefix marker get an all-zero index entry.
        """
        # self.file.write() replaces the Python 2-only "print >>" statement.
        out = self.file.write
        out("static const %s __%s_encmap[%d] = {\n" % (
            self.elemtype, self.prefix, len(self.filler)))
        self.filler.printout(self.file)
        out("};\n\n")

        out("static const %s %s_encmap[256] = {\n" % (
            self.indextype, self.prefix))
        # The filler is reused for the index table; printout() above has
        # already emptied its line buffer.
        for i in range(256):
            if i in fmap and self.prefix in fmap[i]:
                row = fmap[i]
                self.filler.write("{", "__%s_encmap" % self.prefix, "+",
                                  "%d" % row['midx'], ",",
                                  "%d," % row['min'],
                                  "%d" % row['max'], "},")
            else:
                self.filler.write("{", "0,", "0,", "0", "},")
        self.filler.printout(self.file)
        out("};\n\n")

# XXX: convert all usages of this function to EncodeMapWriter
def genmap_encode(filler, prefix, emap):
    """Flatten the encode map ``emap`` into tokens written to ``filler``.

    Function-style counterpart of EncodeMapWriter.buildmap().

    ``emap`` maps a first-level key (only 0..255 are considered) to an inner
    dict of ``{int key: int or tuple value}``.  For each non-empty inner dict
    one token is written for every key in min..max inclusive:

    * ``'N,'`` when the key is absent,
    * the decimal value plus a comma for an int value,
    * ``'M,'`` for a tuple value.

    Side effect: each processed inner dict gains the keys ``prefix`` (True),
    ``'min'``, ``'max'`` and ``'midx'`` (``len(filler)`` before the row was
    written).

    ``filler`` needs write(*tokens) and __len__.
    Raises ValueError for a value that is neither int nor tuple.
    """
    for c1 in range(0, 256):
        # "in" replaces dict.has_key(), which Python 3 removed.
        if c1 not in emap:
            continue
        c2map = emap[c1]
        # sorted() works on Python 3 dict views, unlike keys().sort().
        rc2values = sorted(c2map)
        if not rc2values:
            continue

        c2map[prefix] = True
        c2map['min'] = rc2values[0]
        c2map['max'] = rc2values[-1]
        c2map['midx'] = len(filler)

        for v in range(rc2values[0], rc2values[-1] + 1):
            if v not in c2map:
                filler.write('N,')
                continue
            value = c2map[v]
            if isinstance(value, int):
                filler.write(str(value) + ',')
            elif isinstance(value, tuple):
                filler.write('M,')
            else:
                raise ValueError(
                    "unsupported encode map value: %r" % (value,))

def print_encmap(fo, filler, fmapprefix, fmap, f2map=None, f2mapprefix=''):
    """Write the C encode tables for ``fmapprefix`` to ``fo``.

    First emits ``__<fmapprefix>_encmap``, a DBCHAR array whose body is
    whatever ``filler`` currently holds, then ``<fmapprefix>_encmap``, a
    256-entry ``struct unim_index`` table.

    For each index 0..255 the entry comes from ``fmap`` if that row carries
    the ``fmapprefix`` marker, else from ``f2map`` if that row carries the
    ``f2mapprefix`` marker (the entry then points into
    ``__<f2mapprefix>_encmap``), else it is all zeros.  Marked rows must
    have the ``'midx'``, ``'min'`` and ``'max'`` keys that genmap_encode()
    adds.

    ``fo`` needs write(); ``filler`` needs write(), printout() and __len__.
    ``f2map`` defaults to an empty mapping.
    """
    if f2map is None:
        # None replaces the former mutable {} default; behaviour is the same.
        f2map = {}

    # fo.write() replaces the Python 2-only "print >> fo" statement.
    fo.write("static const DBCHAR __%s_encmap[%d] = {\n" % (
        fmapprefix, len(filler)))
    filler.printout(fo)
    fo.write("};\n\n")

    fo.write("static const struct unim_index %s_encmap[256] = {\n" % (
        fmapprefix))
    for i in range(256):
        # "in" replaces dict.has_key(), which Python 3 removed.
        if i in fmap and fmapprefix in fmap[i]:
            table, prefix = fmap, fmapprefix
        elif i in f2map and f2mapprefix in f2map[i]:
            table, prefix = f2map, f2mapprefix
        else:
            filler.write("{", "0,", "0,", "0", "},")
            continue

        row = table[i]
        filler.write("{", "__%s_encmap" % prefix, "+", "%d" % row['midx'],
                     ",", "%d," % row['min'], "%d" % row['max'], "},")
    filler.printout(fo)
    fo.write("};\n\n")

def genmap_decode(filler, prefix, c1range, c2range, dmap, onlymask=(),
                  wide=0):
    """Flatten the decode map ``dmap`` into tokens written to ``filler``.

    ``c1range`` and ``c2range`` are inclusive ``(low, high)`` pairs limiting
    which first-level and second-level keys are considered.  If ``onlymask``
    is non-empty, only first-level keys contained in it are processed.

    For each selected row with at least one second-level key inside
    ``c2range``, one token is written for every key from the smallest to the
    largest such key: the decimal value plus a comma when present, ``'U,'``
    when absent.

    Side effect: each processed inner dict gains the keys ``prefix`` (True),
    ``'min'``, ``'max'`` and ``'midx'`` (``len(filler)`` before the row was
    written).

    ``wide`` is accepted for signature compatibility but not used here.
    ``filler`` needs write(*tokens) and __len__.
    """
    c2values = range(c2range[0], c2range[1] + 1)

    for c1 in range(c1range[0], c1range[1] + 1):
        # "in" replaces dict.has_key(), which Python 3 removed.
        if c1 not in dmap or (onlymask and c1 not in onlymask):
            continue
        c2map = dmap[c1]
        rc2values = [n for n in c2values if n in c2map]
        if not rc2values:
            continue

        c2map[prefix] = True
        c2map['min'] = rc2values[0]
        c2map['max'] = rc2values[-1]
        c2map['midx'] = len(filler)

        for v in range(rc2values[0], rc2values[-1] + 1):
            if v in c2map:
                filler.write('%d,' % c2map[v])
            else:
                filler.write('U,')

def print_decmap(fo, filler, fmapprefix, fmap, f2map=None, f2mapprefix='',
                 wide=0):
    """Write the C decode tables for ``fmapprefix`` to ``fo``.

    First emits ``__<fmapprefix>_decmap``, an array whose body is whatever
    ``filler`` currently holds, then ``<fmapprefix>_decmap``, a 256-entry
    index table.  With ``wide`` false the C types are ``ucs2_t`` and
    ``struct dbcs_index``; with ``wide`` true they are ``ucs4_t`` and
    ``struct widedbcs_index``.

    For each index 0..255 the entry comes from ``fmap`` if that row carries
    the ``fmapprefix`` marker, else from ``f2map`` if that row carries the
    ``f2mapprefix`` marker (the entry then points into
    ``__<f2mapprefix>_decmap``), else it is all zeros.  Marked rows must
    have the ``'midx'``, ``'min'`` and ``'max'`` keys that genmap_decode()
    adds.

    ``fo`` needs write(); ``filler`` needs write(), printout() and __len__.
    ``f2map`` defaults to an empty mapping.
    """
    if f2map is None:
        # None replaces the former mutable {} default; behaviour is the same.
        f2map = {}

    if not wide:
        elemtype, indextype = 'ucs2_t', 'struct dbcs_index'
    else:
        elemtype, indextype = 'ucs4_t', 'struct widedbcs_index'

    # fo.write() replaces the Python 2-only "print >> fo" statement.
    fo.write("static const %s __%s_decmap[%d] = {\n" % (
        elemtype, fmapprefix, len(filler)))
    filler.printout(fo)
    fo.write("};\n\n")

    fo.write("static const %s %s_decmap[256] = {\n" % (
        indextype, fmapprefix))
    for i in range(256):
        # "in" replaces dict.has_key(), which Python 3 removed.
        if i in fmap and fmapprefix in fmap[i]:
            table, prefix = fmap, fmapprefix
        elif i in f2map and f2mapprefix in f2map[i]:
            table, prefix = f2map, f2mapprefix
        else:
            filler.write("{", "0,", "0,", "0", "},")
            continue

        row = table[i]
        filler.write("{", "__%s_decmap" % prefix, "+", "%d" % row['midx'],
                     ",", "%d," % row['min'], "%d" % row['max'], "},")
    filler.printout(fo)
    fo.write("};\n\n")

def loadmap(fo, natcol=0, unicol=1, sbcs=0):
    """Read a whitespace-separated mapping table into a nested decode map.

    ``fo`` is rewound to its start and read line by line.  Text after a
    ``#`` is ignored, as are lines with fewer than two fields.  Every field
    on a remaining line is evaluated as a Python expression; the columns
    ``natcol`` and ``unicol`` select the native code and the value stored
    for it (presumably a Unicode code point, going by the parameter name).

    Native codes below 0x100 are skipped unless ``sbcs`` is true.

    Returns ``{native >> 8: {native & 0xff: value}}``.  Also writes a
    "Loading from ..." progress line to standard output.
    """
    import sys  # local import keeps this block self-contained

    # sys.stdout.write() replaces the Python 2-only print statement and
    # produces the same "Loading from <fo>" line.
    sys.stdout.write("Loading from %s\n" % (fo,))
    fo.seek(0, 0)
    decmap = {}
    for line in fo:
        fields = line.split('#', 1)[0].split()
        if len(fields) < 2:
            continue

        # NOTE(review): eval() executes whatever the mapping file contains;
        # only load trusted files.  A list is built explicitly because
        # Python 3's map() returns an iterator that cannot be indexed.
        row = [eval(field) for field in fields]
        loc, uni = row[natcol], row[unicol]
        if loc >= 0x100 or sbcs:
            decmap.setdefault(loc >> 8, {})[loc & 0xff] = uni

    return decmap