File: cstools.c

package info (click to toggle)
libapache-csacek 2.1.9-4
links: PTS
area: main
in suites: etch, etch-m68k
size: 1,500 kB
ctags: 1,773
sloc: ansic: 11,833; makefile: 454; yacc: 199; sh: 164; php: 51; sed: 5
file content (758 lines) | stat: -rw-r--r-- 22,986 bytes
parent folder | download | duplicates (3)
#line 2 "cstools.c"
/*-
 * C-SaCzech
 * Copyright (c) 1996-2002 Jaromir Dolecek <dolecek@ics.muni.cz>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Jaromir Dolecek
 *	for the CSacek project.
 * 4. The name of Jaromir Dolecek may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JAROMIR DOLECEK ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL JAROMIR DOLECEK BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* $Id: cstools.c,v 1.70 2002/02/03 11:13:41 dolecek Exp $ */

#include "cstools.h"

/* local functions */
/* stores one Unicode value, encoded as ``code'', into a byte buffer */
static int cstools_writeuni  __P((cstools_t code, cstools_unicode_t, unsigned char*));
/* decodes one Unicode value, encoded as ``code'', from a byte buffer */
static int cstools_readuni   __P((cstools_t code,const unsigned char *,cstools_unicode_t *));
/* looks a Unicode value up in cstools_unicodemap[] */
static unsigned char cstools_finduni __P((cstools_unicode_t));
/* fills a recoding table for a pair of cyrillic encodings */
static int cstools_cy_compile __P((cstools_cstocs_t *, cstools_t, cstools_t));

/*
 * aliases for supported encodings -- one entry of cstools_itab[], which
 * cstools_whichcode() searches
 */
typedef struct       
{                     
	size_t		len;	/* number of characters in ``name'' */
	const char	*name;	/* alias text; NULL in the terminating entry */
	cstools_t	 code;	/* encoding code the alias stands for */
} cstools_tabitem_t;

/*
 * Table of encoding aliases searched by cstools_whichcode().
 *
 * Each entry holds the alias length, the alias itself and the encoding
 * code it denotes.  Two invariants must hold for the lookup to work:
 *  - ``len'' must be exactly the number of characters in ``name'', because
 *    the lookup only accepts an entry when its length equals the length of
 *    the name being searched for;
 *  - entries must stay sorted by ascending ``len'', because the lookup
 *    stops at the first entry longer than the name being searched for.
 * The table is terminated by an entry with a NULL name.
 *
 * NOTE(review): the lookup ignores case, so "Latin2" (PC Latin2) can never
 * be reached -- "latin2" (ISO Latin 2) precedes it and always wins.
 */
static const cstools_tabitem_t
cstools_itab[] =
{
  {  2,	"l1",		CSTOOLS_ISOLatin1	},
  {  2,	"l2",		CSTOOLS_ISOLatin2	},
  {  3,	"ASC",		CSTOOLS_ASCII		},
  {  3,	"IL1",		CSTOOLS_ISOLatin1	},
  {  3,	"IL2",		CSTOOLS_ISOLatin2	},
  {  3,	"Win",		CSTOOLS_CP1250		},
  {  3,	"Kam",		CSTOOLS_Kam		},
  {  3,	"852",		CSTOOLS_PCLatin2	},
  {  3,	"Mac",		CSTOOLS_Mac		},
  {  3,	"850",		CSTOOLS_CP850		},
  {  3,	"Uni",		CSTOOLS_Unicode		},
  {  3,	"lat",		CSTOOLS_PCLatin2	},
  {  3,	"iso",		CSTOOLS_ISOLatin2	},
  {  3,	"koi",		CSTOOLS_KOI8_CS		},
  {  4,	"1250",		CSTOOLS_CP1250		},
  {  4,	"Koi8",		CSTOOLS_KOI8_CS		},
  {  4,	"UTF8",		CSTOOLS_UTF8		},
  {  5,	"ASCII",	CSTOOLS_ASCII		},
  {  5,	"CP819",	CSTOOLS_ISOLatin1	},
  {  5,	"CP852",	CSTOOLS_PCLatin2	},
  {  5,	"cp850",	CSTOOLS_CP850		},
  {  5,	"cp866",	CSTOOLS_CP866		},
  {  5,	"UTF-8",	CSTOOLS_UTF8		},
  {  6,	"latin1",	CSTOOLS_ISOLatin1	},
  {  6,	"IBM819",	CSTOOLS_ISOLatin1	},
  {  6,	"latin2",	CSTOOLS_ISOLatin2	},
  {  6,	"cp1251",	CSTOOLS_CP1251		},
  {  6,	"CP1250",	CSTOOLS_CP1250		},
  {  6,	"Latin2",	CSTOOLS_PCLatin2	},
  {  6,	"IBM852",	CSTOOLS_PCLatin2	},
  {  6,	"Koi8CS",	CSTOOLS_KOI8_CS		},
  {  6,	"koi8-r",	CSTOOLS_KOI8_R		},
  {  7,	"Windows",	CSTOOLS_CP1250		},
  {  7,	"KEYBCS2",	CSTOOLS_Kam		},
  {  7,	"KOI8-CS",	CSTOOLS_KOI8_CS		},
  {  7,	"Unicode",	CSTOOLS_Unicode		},
  {  8,	"us-ascii",	CSTOOLS_ASCII		},
  {  8,	"Kamenici",	CSTOOLS_Kam		},
  {  8,	"x-kam-cs",	CSTOOLS_Kam		},
  {  8,	"PCLatin2",	CSTOOLS_PCLatin2	},
  {  8,	"csPCp852",	CSTOOLS_PCLatin2	},
  {  8,	"x-mac-ce",	CSTOOLS_Mac		},
  {  9,	"ISOLatin1",	CSTOOLS_ISOLatin1	},
  {  9,	"ISO8859-1",	CSTOOLS_ISOLatin1	},
  {  9,	"ISOLatin2",	CSTOOLS_ISOLatin2	},
  {  9,	"ISO8859-2",	CSTOOLS_ISOLatin2	},
  {  9,	"Macintosh",	CSTOOLS_Mac		},
  { 10,	"ISO-8859-1",	CSTOOLS_ISOLatin1	},
  { 10,	"iso-ir-100",	CSTOOLS_ISOLatin1	},
  { 10,	"ISO_8859-1",	CSTOOLS_ISOLatin1	},
  { 10,	"ISO-8859-2",	CSTOOLS_ISOLatin2	},
  { 10,	"iso-ir-101",	CSTOOLS_ISOLatin2	},
  { 10,	"ISO_8859-2",	CSTOOLS_ISOLatin2	},
  { 10,	"csn_369103",	CSTOOLS_KOI8_CS		},
  { 11,	"ISO Latin 1",	CSTOOLS_ISOLatin1	},
  { 11,	"ISO-Latin-1",	CSTOOLS_ISOLatin1	},
  { 11,	"csISOLatin1",	CSTOOLS_ISOLatin1	},
  { 11,	"ISO Latin 2",	CSTOOLS_ISOLatin2	},
  { 11,	"ISO-Latin-2",	CSTOOLS_ISOLatin2	},
  { 11,	"csISOLatin2",	CSTOOLS_ISOLatin2	},
  { 11,	"unicode-1-1",	CSTOOLS_Unicode		},
  { 12,	"windows-1250", CSTOOLS_CP1250		},
  { 12,	"windows-1251", CSTOOLS_CP1251		},
  /* the next two lengths used to be 16 and 18, one short of the real */
  /* alias length, which made both aliases impossible to match */
  { 17,	"unicode-1-1-utf-8",	CSTOOLS_UTF8	},
  { 19,	"csPC850Multilingual",	CSTOOLS_CP850	},
  {  0,	NULL, CSTOOLS_UNKNOWN }
}; /* cstools_itab[] */

/*
 * Names of the supported encodings, four strings per encoding.
 * cstools_name() reads entry [4*idx + which], where ``idx'' is the value
 * cstools_code2index() returns for the encoding and ``which'' selects one
 * of the four columns.
 * NOTE(review): the last column looks like the MIME charset name; confirm
 * the meaning of each column against the CSTOOLS_*NAME constants in
 * cstools.h.
 */
static const char * const
cstools_names[] = 
{
  "ASCII",	"ASCII",	"asc",	"us-ascii",
  "ISO-8859-1", "ISO-8859-1",   "il1",	"iso-8859-1",
  "ISO-8859-2", "ISO-8859-2",	"il2",	"iso-8859-2",
  "CP1250",	"MS Win",	"win",	"windows-1250",
  "KEYBCS2",	"Kam",		"kam",	"x-kam-cs",
  "CP852",	"PC Latin2",	"pc2",	"cp852",
  "KOI8-CS",	"KOI8-CS",	"koi",	"csn_369103",
  "MAC",	"Mac",		"mac",	"x-mac-ce",
  "CP850",	"IBM 850",	"850",	"cp850",
  "KOI8-R",	"KOI8-R",	"koi8-r", "koi8-r",
  "ISO-8859-5",	"ISO-8859-5",	"ic5",	"iso-8859-5",
  "CP866",	"CP866",	"866",	"cp866",
  "CP1251",	"CP1251",	"win1251",  "windows-1251",
  "UTF8",	"UTF-8",	"utf8", "utf-8",
  NULL
};

/*
 * used to compile recoding table
 *
 * One string per 8bit encoding, indexed by the encoding code (see
 * cstools_init(), which reads cstools_map[from] and cstools_map[to]).
 * The same position in every string stands for the same character:
 * cstools_init() maps frommap[i] to tomap[i] for every frommap[i] > 127.
 * cstools_guess_charset() treats the bytes >= 128 of each string (except
 * the first, ASCII one) as the set of characters valid in that encoding.
 *
 * NOTE(review): the strings below have visibly different lengths and
 * contain multi-byte characters, which suggests the original 8bit bytes
 * were damaged by a character set conversion of this copy of the file --
 * restore them from a pristine source before relying on this table.
 */
static const unsigned char * const
cstools_map[] =
{
   /* ASCII */
   (const unsigned char *) "AAAAACCCDDEEEEIILLLNNOOOORRSSSTTUUUUYZZZaaaaacccddeeeeiilllnnoooorrsssttuuuuyzzz/.\"\"-'' x'''\"oS",
   /* ISO-8859-1 */
   (const unsigned char *) "AACCDEELLLNNORRSSSTTUUZZZaaccdeelllnnorrsssttuuzzz.\"\"-״''\"",
   /* ISO-8859-2 */
   (const unsigned char *) "ġţݮ峵๺.\"\"-''״''\"",
   /* Windows-1250 */
   (const unsigned char *) "ĥţݎ峾״",
   /* Kam */
   (const unsigned char *) "AAACCDEEILNOSSTUZZaaaccdeeilnosstuzz.\"\"-'' x'''\"oS",
   /* PC Latin2 */
   (const unsigned char *) "Ǝѐӷב⊙渗ǄЂء墓筘짫.\"\"-Ք",
   /* KOI-8 CS */
   (const unsigned char *) "AACCDEEILNOSSTUZZaaaccdeeilnosstuzz/.\"\"-'' x'''\"oS",
   /* MAC */
   (const unsigned char *) "ACDEI̅STacdei˗Κstx'Ք",
   /* CP 850 */
   (const unsigned char *) "AACCDDEELLLNNORRSSSTTUUZZZaaccddeelllnnorrsssttuuzzz.\"\"-''\"",
   /* end */
   NULL
};

/*
 * table for mapping between cyrillic encodings
 *
 * One string per cyrillic encoding; cstools_cy_compile() selects a string
 * with (code - CSTOOLS_KOI8_R) and maps the i-th byte of the source
 * string to the i-th byte of the target string.
 *
 * NOTE(review): the KOI8-R string is empty in this copy of the file and
 * the others contain multi-byte characters, which suggests the original
 * 8bit bytes were damaged by a character set conversion -- restore them
 * from a pristine source before relying on this table.
 */
static const unsigned char * const
cstools_cy_map[] = {
	/* KOI8-R */
	(const unsigned char *)
	"",
	/* iso-8859-5 */
	(const unsigned char *)
	"ΰƴĳö˷",
	/* cp866 */
	(const unsigned char *)
	"椥䣪㦢Ꞁ􉊋",
	/* windows-1251 */
	(const unsigned char *)
	"꿼ʯ",
	/* end */
	NULL
};
	
/*
 * map table between windows-1250 and Unicode 2.0 -- iso-8859-2 is
 * not suitable, because it doesn't include a few typographical chars
 * windows-1250 has; MAC encoding could be used, but windows-1250
 * has at least _some_ characters same as iso-8859-2 -- MAC chars are almost
 * totally different 
 *
 * Each row is { windows-1250 character, Unicode value }.  cstools_init()
 * indexes this table with the same position it uses for the cstools_map[]
 * strings, so the row order has to follow the character order of those
 * strings.  cstools_finduni() searches the second column and stops at the
 * first row whose first column is zero.
 *
 * NOTE(review): most character constants below are empty in this copy of
 * the file (the 8bit bytes seem to have been lost in a character set
 * conversion); restore them from a pristine source before building.
 * NOTE(review): 0x00D5/0x00F5 and 0x0168/0x0169 are O/U with tilde;
 * presumably O/U with double acute (0x0150/0x0151, 0x0170/0x0171) were
 * meant, as those are the letters windows-1250 contains -- verify.
 */
static const cstools_unicode_t
cstools_unicodemap[][2] = {

	/* upper-case letters */
	{	(cstools_unicode_t) '',	0x00C1 },
	{	(cstools_unicode_t) 'A',	0x00C2 },
	{	(cstools_unicode_t) '',	0x0102 },
	{	(cstools_unicode_t) '',	0x00C4 },
	{	(cstools_unicode_t) '',	0x0104 },
	{	(cstools_unicode_t) '',	0x0106 },
	{	(cstools_unicode_t) 'C',	0x00C7 },
	{	(cstools_unicode_t) '',	0x010C },
	{	(cstools_unicode_t) '',	0x010E },
	{	(cstools_unicode_t) 'D',	0x0110 },
	{	(cstools_unicode_t) '',	0x00C9 },
	{	(cstools_unicode_t) '',	0x0118 },
	{	(cstools_unicode_t) 'E',	0x00CB },
	{	(cstools_unicode_t) '',	0x011A },
	{	(cstools_unicode_t) 'I',	0x00CD },
	{	(cstools_unicode_t) '',	0x00CE },
	{	(cstools_unicode_t) '',	0x0139 },
	{	(cstools_unicode_t) '',	0x0141 },
	{	(cstools_unicode_t) '',	0x013D },
	{	(cstools_unicode_t) '',	0x0143 },
	{	(cstools_unicode_t) '',	0x0147 },
	{	(cstools_unicode_t) '',	0x00D3 },
	{	(cstools_unicode_t) '',	0x00D4 },
	{	(cstools_unicode_t) '',	0x00D5 },
	{	(cstools_unicode_t) '',	0x00D6 },
	{	(cstools_unicode_t) '',	0x0158 },
	{	(cstools_unicode_t) '',	0x0154 },
	{	(cstools_unicode_t) '',	0x0160 },
	{	(cstools_unicode_t) 'S',	0x015E },
	{	(cstools_unicode_t) '',	0x015A },
	{	(cstools_unicode_t) '',	0x0164 },
	{	(cstools_unicode_t) 'T',	0x0162 },
	{	(cstools_unicode_t) '',	0x016E },
	{	(cstools_unicode_t) '',	0x00DA },
	{	(cstools_unicode_t) '',	0x0168 },
	{	(cstools_unicode_t) '',	0x00DC },
	{	(cstools_unicode_t) '',	0x00DD },
	{	(cstools_unicode_t) '',	0x017D },
	{	(cstools_unicode_t) '',	0x0179 },
	{	(cstools_unicode_t) '',	0x017B },

	/* lower-case letters */
	{	(cstools_unicode_t) '',	0x00E1 },
	{	(cstools_unicode_t) '',	0x00E2 },
	{	(cstools_unicode_t) '',	0x0103 },
	{	(cstools_unicode_t) '',	0x00E4 },
	{	(cstools_unicode_t) '',	0x0105 },
	{	(cstools_unicode_t) '',	0x0107 },
	{	(cstools_unicode_t) '',	0x00E7 },
	{	(cstools_unicode_t) '',	0x010D },
	{	(cstools_unicode_t) '',	0x010F },
	{	(cstools_unicode_t) '',	0x0111 },
	{	(cstools_unicode_t) '',	0x00E9 },
	{	(cstools_unicode_t) '',	0x0119 },
	{	(cstools_unicode_t) '',	0x00CC }, /* NOTE(review): upper-case row has 0x00CB, so 0x00EB was probably meant -- verify */
	{	(cstools_unicode_t) '',	0x011B },
	{	(cstools_unicode_t) '',	0x00ED },
	{	(cstools_unicode_t) '',	0x00EE },
	{	(cstools_unicode_t) '',	0x0140 }, /* NOTE(review): upper-case row has 0x0139, so 0x013A was probably meant -- verify */
	{	(cstools_unicode_t) '',	0x0142 },
	{	(cstools_unicode_t) '',	0x013E },
	{	(cstools_unicode_t) '',	0x0144 },
	{	(cstools_unicode_t) '',	0x0148 },
	{	(cstools_unicode_t) '',	0x00F3 },
	{	(cstools_unicode_t) '',	0x00F4 },
	{	(cstools_unicode_t) '',	0x00F5 },
	{	(cstools_unicode_t) '',	0x00F6 },
	{	(cstools_unicode_t) '',	0x0159 },
	{	(cstools_unicode_t) '',	0x0155 },
	{	(cstools_unicode_t) '',	0x0161 },
	{	(cstools_unicode_t) '',	0x015F },
	{	(cstools_unicode_t) '',	0x015B },
	{	(cstools_unicode_t) '',	0x0165 },
	{	(cstools_unicode_t) '',	0x0163 },
	{	(cstools_unicode_t) '',	0x016F },
	{	(cstools_unicode_t) '',	0x00FA },
	{	(cstools_unicode_t) '',	0x0169 },
	{	(cstools_unicode_t) '',	0x00FC },
	{	(cstools_unicode_t) '',	0x00FD },
	{	(cstools_unicode_t) '',	0x017E },
	{	(cstools_unicode_t) '',	0x017A },
	{	(cstools_unicode_t) '',	0x017C },

	/* special characters */
	{	(cstools_unicode_t) '',	0x00F7 }, /* division sign */
   	{	(cstools_unicode_t) '',	0x2026 }, /* hor. ellipsis */
	{	(cstools_unicode_t) '',	0x201E }, /* doub.low-9 q.m.*/
	{	(cstools_unicode_t) '',	0x201C }, /* l.doub.quot.mark */
	{	(cstools_unicode_t) '',	0x2013 }, /* en dash */
	{	(cstools_unicode_t) '',	0x00AB }, /* l.d.ang. q.m. */
	{	(cstools_unicode_t) '',	0x00BB }, /* r.d.ang. q.m. */
	{	(cstools_unicode_t) '',	0x00A0 }, /* no break space */
	{	(cstools_unicode_t) '',	0x00D7 }, /* multiplic. sign */
	{	(cstools_unicode_t) '',	0x00B4 }, /* apostroph */
	{	(cstools_unicode_t) '',	0x2018 }, /* l.sing.quot.mark */
	{	(cstools_unicode_t) '',	0x2019 }, /* r.sing.quot.mark */
	{	(cstools_unicode_t) '',	0x201D }, /* r.doub.quot.mark */

	/* last item */
	{	(cstools_unicode_t) '\0',	0x0000 },
};

/*
 * Looks the encoding alias ``name'' up in cstools_itab[] and returns the
 * matching CSTOOLS_FOO code; returns CSTOOLS_UNKNOWN if the alias is not
 * known or if ``name'' is NULL.
 * Only the first ``length'' characters of ``name'' are compared and the
 * comparison ignores case; a ``length'' of 0 means the length of the
 * whole string ``name''.
 */
cstools_t
cstools_whichcode(name, length)
  const char *name;
  size_t length;
{
   size_t i;

   /* nothing to look up -- avoid passing NULL to strlen()/strncasecmp() */
   if (name == NULL)
	return CSTOOLS_UNKNOWN;

   if (length == 0)
	length = strlen(name);

   /* entries are sorted by alias length, so the search can stop at the */
   /* first alias longer than the name we are looking for */
   for(i=0; cstools_itab[i].name && cstools_itab[i].len <= length; i++) {
     /* the first-character test is a cheap filter before the full compare */
     if (cstools_itab[i].len == length
	 && CSTOOLS_UPPER(name[0]) == CSTOOLS_UPPER(cstools_itab[i].name[0])
	 && strncasecmp(name, cstools_itab[i].name, length) == 0)
        return cstools_itab[i].code;
   }

   return CSTOOLS_UNKNOWN;
}

/*
 * compiles recoding table for coding from encoding ``from'' to encoding
 * ``to''; ``from'' and ``to'' are values got previously by
 * call to cstools_whichcode() or by direct CSTOOLS_FOO constant assigment
 *
 * On success ``mp'' holds the source and target codes, the conversion
 * type (mp->typeconv) used by cstools_recode() to select the recoding
 * method, and the 256-entry map.  By default bytes 0-127 map to
 * themselves and bytes 128-255 map to '_'.
 *
 * returns 0 on success, -1 if ``mp'' is NULL or any of the codes is
 * lower than CSTOOLS_MINCODE
 */
int 
cstools_init(mp, from, to)
  cstools_cstocs_t *mp;
  cstools_t from, to;
{
	int i;
	unsigned char z;
	const unsigned char *frommap, *tomap;

	if ( mp == NULL || from < CSTOOLS_MINCODE || to < CSTOOLS_MINCODE )
		return -1;

	/* set up default values */
	for(i=0; i<256 ; i++) {
		mp->map[i] = (i & 0x80) ? '_' : i;
	}

	/* set up attributes of recoding table */
	mp->source = from;
	mp->target = to;

	/*
	 * Classify the conversion before any early return, so that
	 * cstools_recode() never sees a stale typeconv; formerly the
	 * Unicode -> Unicode case returned before typeconv was assigned.
	 */
	if (CSTOOLS_ISUNICODE(from) && CSTOOLS_ISUNICODE(to))
		mp->typeconv = CSTOOLS_BOTHUNI;
	else if (CSTOOLS_ISUNICODE(from))
		mp->typeconv = CSTOOLS_FROMUNI;
	else if (CSTOOLS_ISUNICODE(to))
		mp->typeconv = CSTOOLS_TOUNI;
	else
		mp->typeconv = CSTOOLS_NOUNI;

	/* no recoding table between Unicodes */
	if (mp->typeconv == CSTOOLS_BOTHUNI)
		return 0;

	/* XXX for now, recoding between cyrilic & latin or Unicode
	 * is not supported - i.e. all chars >= 128 are filtered out */
	if (CSTOOLS_ISCYRILLIC(from) != CSTOOLS_ISCYRILLIC(to))
		return 0;
	else if (CSTOOLS_ISCYRILLIC(from))
		return cstools_cy_compile(mp, from, to);

	/* set up recoding map */
	if (CSTOOLS_ISUNICODE(to)) {
		/* 8bit -> Unicode: position i of the source map corresponds */
		/* to row i of cstools_unicodemap[]; stop at the terminating */
		/* row so a longer source map cannot index past the table */
		frommap = cstools_map[from];
		for(i=0; frommap[i] && cstools_unicodemap[i][0]; i++ )
		{
			z = frommap[i];
			if (z > 127) mp->map[z] = cstools_unicodemap[i][1];
		}
	}
	else {
		/* Unicode input is first turned into windows-1250 by */
		/* cstools_finduni(), so build the table from that encoding */
		if (CSTOOLS_ISUNICODE(from)) from = CSTOOLS_CP1250;

		frommap = cstools_map[from];
		tomap   = cstools_map[to];
		/* stop at the end of the shorter map, so tomap is never */
		/* read past its terminator */
		for(i=0; frommap[i] && tomap[i]; i++ )
		{
			z = frommap[i];
			if ( z > 127 ) mp->map[z] = tomap[i];
		}
	}

	return 0;
}

/*
 * Stores the Unicode value ``unichar'' into ``dst'' using the Unicode
 * encoding ``code'':
 *   CSTOOLS_Unicode - two bytes, the more significant one first
 *   CSTOOLS_UTF8    - one to three bytes
 * Returns the number of bytes stored; that is 0 for any other ``code''
 * and for UTF-8 values of 0x10000 and above.
 */ 
static int
cstools_writeuni(code, unichar, dst)
  cstools_t	code;
  cstools_unicode_t	unichar;
  unsigned char *dst;
{
	if (code == CSTOOLS_Unicode) {
		dst[0] = (unichar & 0xff00) >> 8;
		dst[1] = (unichar & 0xff);
		return 2;
	}

	if (code != CSTOOLS_UTF8)
		return 0;

	/* single byte: 0xxxxxxx */
	if (unichar < 0x80) {
		dst[0] = unichar & 0xff;
		return 1;
	}

	/* two bytes: 110xxxxx 10xxxxxx */
	if (unichar < 0x800) {
		dst[0] = 0xc0 | (unichar >> 6);
		dst[1] = 0x80 | (unichar & 0x3f);
		return 2;
	}

	/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (unichar < 0x10000) {
		dst[0] = 0xe0 | (unichar >> 12);
		dst[1] = 0x80 | ((unichar >> 6) & 0x3f);
		dst[2] = 0x80 | (unichar & 0x3f);
		return 3;
	}

	/* anything bigger is not written at all */
	return 0;
}

/*
 * reads one unicode char encoded in ``code'' from ``src'' and stores
 * result in ``*unichar''
 *
 * CSTOOLS_Unicode input is two bytes, the more significant one first.
 * CSTOOLS_UTF8 input is decoded for sequences of one to three bytes;
 * longer sequences are skipped and ``*unichar'' is set to 0 for them.
 * A lead byte lower than 0xc0 (this includes stray continuation bytes)
 * is returned as is.
 *
 * The caller has to guarantee that the whole sequence is readable from
 * ``src'' -- there is no length argument to check against.
 *
 * returns number of characters read, 0 if ``code'' is not an Unicode
 * encoding
 */
static int
cstools_readuni(code, src, unichar)
  cstools_t code;
  const unsigned char *src;
  cstools_unicode_t *unichar;
{
	if (code == CSTOOLS_Unicode) {
		*unichar = (src[0] * 0x100) + src[1];
		return 2;
	}
	else if (code == CSTOOLS_UTF8) {
		int cread = 0; 

		if (src[0] < 0xc0) {
			*unichar = src[0];
			cread = 1;
		}
		else if (src[0] < 0xe0) {
			/* 110xxxxx 10xxxxxx */
			*unichar = ((src[0] & 0x1f) << 6) | (src[1] & 0x3f);
			cread = 2;
		}
		else if (src[0] < 0xf0) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			*unichar = ((src[0] & 0x0f) << 12)
				 | ((src[1] & 0x3f) << 6)
				 |  (src[2] & 0x3f);
			/* formerly the length was left at 0 here, which */
			/* made the caller stop at the first such char */
			cread = 3;
		}
		else { /* skip invalid (for me) unicode char */
			/* number of leading 1 bits == length of sequence */
			unsigned int num = 0x80;
			for(;src[0] & num; num >>= 1) cread++;
			*unichar = 0;
		}

		return cread;
	}

	return 0;
}

/*
 * finds ``unichar'' in cstools_unicodemap and returns appropriate
 * windows-1250 char
 *
 * Values 0-127 are returned unchanged.  Values 128-255 are first looked
 * up in the iso-8859-1 row of cstools_map[] and translated to the char on
 * the same position of the windows-1250 row.  Everything not resolved
 * that way is searched for in cstools_unicodemap[].
 *
 * returns 0 if no windows-1250 char is known for ``unichar''
 */
static unsigned char
cstools_finduni(unichar)
  cstools_unicode_t unichar;
{
	size_t i;
	
	/*
	 * 0-127 are ASCII, mapped directly.  This has to be a range test:
	 * the former bit test (unichar & 0x7f) was true for nearly every
	 * value and truncated e.g. 0x2026 to '&'.
	 */
	if (unichar < 0x80)
		return (unsigned char) unichar;

	if (unichar < 0x100) {
		/* 128-255 are equal to iso-8859-1, so translate it */
		/* to windows-1250 */
		const unsigned char *latin1 = cstools_map[CSTOOLS_ISOLatin1];
		const unsigned char *cp1250 = cstools_map[CSTOOLS_CP1250];

		/* stop at the end of the shorter row */
		for(i=0; latin1[i] && cp1250[i]; i++)
			if (latin1[i] == unichar)
				return cp1250[i];
	}

	for(i=0; cstools_unicodemap[i][0]; i++) {
		if (unichar == cstools_unicodemap[i][1])
			return (unsigned char) cstools_unicodemap[i][0];
	} 

	return 0;
}


/*
 * Recodes ``src_len'' bytes of ``src_orig'' into ``dst_orig'' according to
 * the table ``mp'' (prepared by cstools_init()).
 * The caller has to provide ``dst_orig'' big enough for the whole result.
 * If ``src_len'' is 0, nothing is done and 0 is returned.  If source and
 * target encoding of ``mp'' are the same, the data are just copied (unless
 * both buffers are the same) and ``src_len'' is returned.
 * Returns the number of bytes written to the destination.
 */
int 
cstools_recode(mp, src_orig, dst_orig, src_len)
  cstools_cstocs_t *mp;
  const char *src_orig;
  char *dst_orig;
  size_t  src_len;
{
	const unsigned char *in = (const unsigned char *) src_orig;
	unsigned char *out = (unsigned char *) dst_orig;
	const unsigned char *stop;
	cstools_unicode_t uc;
	unsigned char ch;
	int step;

	if (src_len == 0)
		return 0;

	/* same encoding on both sides -- just make sure the data end up */
	/* in the destination buffer */
	if (mp->source == mp->target) {
		if (dst_orig != src_orig) {
			/* LINTED */
			memcpy(dst_orig, src_orig, src_len);
		}
		return src_len;
	}

	/* first byte behind the source data */
	stop = in + src_len;

	switch(mp->typeconv) {
	case CSTOOLS_NOUNI:
		/* 8bit -> 8bit: plain table lookup, byte by byte */
		while (in < stop) {
			*out = mp->map[*in] & 0xFF;
			in++;
			out++;
		}
		break;
	case CSTOOLS_BOTHUNI:
		/* Unicode -> Unicode: decode one char, encode it again */
		for (;;) {
			step = cstools_readuni(mp->source, in, &uc);
			if (step == 0)
				break;
			in += step;
			out += cstools_writeuni(mp->target, uc, out);
			if (in >= stop)
				break;
		}
		break;
	case CSTOOLS_FROMUNI:
		/* Unicode -> 8bit: decode, find the 8bit char, look it up */
		for (;;) {
			step = cstools_readuni(mp->source, in, &uc);
			if (step == 0)
				break;
			in += step;
			ch = cstools_finduni(uc);
			*out = mp->map[ch] & 0xFF;
			out++;
			if (in >= stop)
				break;
		}
		break;
	case CSTOOLS_TOUNI:
		/* 8bit -> Unicode: look the byte up, encode the result */
		while (in < stop) {
			uc = mp->map[*in];
			out += cstools_writeuni(mp->target, uc, out);
			in++;
		}
		break;
	}

	return (out - (unsigned char *) dst_orig);
}

/*
 * Tries to guess the encoding of the first ``len'' bytes of ``o_text''.
 *
 * Returns CSTOOLS_BINARY if the text contains a control character other
 * than \n, \r and \t, CSTOOLS_ASCII if no character >= 128 was found,
 * the CSTOOLS_FOO code of the first encoding in cstools_map[] which
 * contains all the characters >= 128 found, or CSTOOLS_UNKNOWN if there
 * is no such encoding.  iso-8859-1 is returned only if no later encoding
 * matches.
 */
cstools_t
cstools_guess_charset(o_text, len)
  const char *o_text;
  size_t len;
{
	const unsigned char *text = (const unsigned char *)o_text;
	const unsigned char *encmap;
	unsigned char chars[128], isin[128], c, pc;
	cstools_t retval = CSTOOLS_UNKNOWN;
	int is_ascii = 1;
	size_t i;

	/* chars[x] != 0 means character (x + 128) is used in the text */
	memset((void *)chars, 0, sizeof(chars));

	/*
	 * Collect all the characters >= 128 used in the text.  The HTTP
	 * escape sequence %XY counts as the character it encodes, %% is
	 * skipped.
	 */
	for(i=0; i<len; i++) {
		if (text[i] == '\n' || text[i] == '\r' || text[i] == '\t')
			continue;

		if (text[i] < 32)
			return CSTOOLS_BINARY;

		if (text[i] == '%' && len - i >= 2) {
			if (text[i+1] == '%') {
				i++;
				continue;
			}

			/* not followed by two hex digits -- plain '%' */
			if (!(len - i >= 3 && CSA_ISHEXA(text[i+1])
			      && CSA_ISHEXA(text[i+2])))
				continue;

			c = CSA_UPPER(text[i+1]);
			c = CSA_HEX2DEC(c) << 4;
			pc = CSA_UPPER(text[i+2]);
			c += CSA_HEX2DEC(pc);
			i += 2;		/* skip both hex digits */
			if (c < 128)
				continue;
		}
		else {
			if (text[i] < 128)
				continue;
			c = text[i];
		}

		/* got a character >= 128, remember it */
		is_ascii = 0;
		chars[c & 0x7f] = 1;
	}

	if (is_ascii)
		return CSTOOLS_ASCII;

	/*
	 * Try every encoding of cstools_map[] but the first one (ASCII):
	 * the text may be in the encoding if every character used is
	 * part of the encoding's map.
	 */
	for(i=1; cstools_map[i]; i++) {
		memcpy(isin, chars, sizeof(isin));

		/* drop the flag of every character the encoding knows */
		for(encmap = cstools_map[i]; *encmap; encmap++) {
			if ((*encmap) >= 128)
				isin[(*encmap)-128] = 0;
		}

		/* some character left -- not this encoding */
		if (memchr(isin, 1, sizeof(isin)))
			continue;

		retval = cstools_index2code((int) i);
		/* iso-8859-1 might be a false match, keep searching */
		if (retval != CSTOOLS_ISOLatin1)
			return retval;
	}
	
	return (retval);
}

/*
 * Returns the name of encoding ``code''; ``which'' selects which of the
 * names kept in cstools_names[] is returned.  For CSTOOLS_LAMPANAME the
 * CSTOOLS_MIMENAME name is used, with several encodings replaced by
 * CSTOOLS_ASCII first.
 * Returns NULL if ``code'' or ``which'' is out of range.
 */
const char *
cstools_name(code, which)
  cstools_t code;
  int which;
{
	int row;

	if (code < CSTOOLS_MINCODE)
		return NULL;
	if (which < 0 || which > 4)
		return NULL;

	if (which == CSTOOLS_LAMPANAME) {
		which = CSTOOLS_MIMENAME;
		switch (code) {
		case CSTOOLS_Kam:
		case CSTOOLS_KOI8_CS:
		case CSTOOLS_Mac:
		case CSTOOLS_CP850:
		case CSTOOLS_KOI8_R:
		case CSTOOLS_CP866:
		case CSTOOLS_CP1251:
			code = CSTOOLS_ASCII;
			break;
		default:
			break;
		}
	}

	/* four names are stored for every encoding */
	row = cstools_code2index(code);
	return (row < 0) ? NULL : cstools_names[4*row + which];
}

/*
 * Converts encoding ``code'' to its index in cstools_names[].
 * CSTOOLS_UTF8 gets the index right after the last 8bit encoding; -1 is
 * returned for other Unicode encodings and for codes lower than
 * CSTOOLS_MINCODE.
 */
int
cstools_code2index(code)
   cstools_t code;
{
	if (code == CSTOOLS_UTF8)
		return (int) CSTOOLS_MAX8BIT + 1;

	if (CSTOOLS_ISUNICODE(code) || code < CSTOOLS_MINCODE)
		return -1;

	return (int) code;
}

/*
 * Converts ``index'' back to an encoding code: indexes up to
 * CSTOOLS_MAX8BIT are the codes themselves, higher ones are counted from
 * CSTOOLS_UTF8.  Returns CSTOOLS_UNKNOWN for indexes lower than
 * CSTOOLS_MINCODE.
 */
cstools_t
cstools_index2code(index)
    int index;
{
	if (index > (int)CSTOOLS_MAX8BIT)
		return (cstools_t)
			(index - 1 - (int)CSTOOLS_MAX8BIT + (int)CSTOOLS_UTF8);

	if (index < (int)CSTOOLS_MINCODE)
		return CSTOOLS_UNKNOWN;

	return (cstools_t) index;
}

/*
 * Fills the map of ``mp'' for recoding between the cyrillic encodings
 * ``from'' and ``to'': every byte of the ``from'' row of cstools_cy_map[]
 * is mapped to the byte on the same position of the ``to'' row.
 * Always returns 0.
 */
static int
cstools_cy_compile(mp, from, to)
  cstools_cstocs_t *mp;
  cstools_t from, to;
{
	/* rows of cstools_cy_map[] are counted from CSTOOLS_KOI8_R */
	const unsigned char *srcrow =
		cstools_cy_map[(int)from - (int)CSTOOLS_KOI8_R];
	const unsigned char *dstrow =
		cstools_cy_map[(int)to - (int)CSTOOLS_KOI8_R];
	size_t pos = 0;

	/* every byte in the rows is > 127, so no range check is done */
	while (srcrow[pos]) {
		mp->map[srcrow[pos]] = dstrow[pos];
		pos++;
	}

	return 0;
}