File: strs.c

package info (click to toggle)
msort 8.53-3
links: PTS, VCS
area: main
in suites: forky, sid
size: 2,364 kB
sloc: sh: 10,138; ansic: 10,031; makefile: 52
file content (436 lines) | stat: -rw-r--r-- 10,740 bytes
parent folder | download | duplicates (6)
/*
 * Copyright (C) 1993-2007 William J. Poser.
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 3 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * or go to the web page:  http://www.gnu.org/licenses/gpl.txt.
 */

#include "config.h"
#include "compdefs.h"

#include <stdlib.h>
#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif
#include <stdio.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#ifdef HAVE_UNINUM_UNICODE_H
#include <uninum/unicode.h>
#else
#include "unicode.h"
#endif
#include "utf8error.h"
#include "exitcode.h"

#define TRUE 1
#define FALSE 0

/*
 * Duplicate a null-terminated string in freshly allocated storage and
 * return a pointer to the duplicate. Unlike strcpy(3), the destination
 * is allocated here rather than supplied by the caller, who becomes
 * responsible for freeing it.
 *
 * If the allocation fails, a message is written to stderr and the
 * program exits with status OUTOFMEMORY, so the return value is never NULL.
 */

char *
copy_string(char *string)
{
  size_t bytes = strlen(string) + 1;	/* characters plus terminator */
  char *dup = (char *) malloc(bytes);

  if(dup == NULL){
    fprintf(stderr,"copy_string: out of memory\n");
    exit(OUTOFMEMORY);
  }
  memcpy(dup,string,bytes);
  return(dup);
}

/*
 * Allocate storage for a wide string of the required length (in wide
 * characters, not counting the terminator) and return a pointer to it.
 * The string is initialized to the empty string by storing a null wide
 * character in its first element.
 *
 * Returns NULL if length is negative or if the allocation fails.
 * The caller owns the returned storage.
 */

wchar_t *
wCreateString(int length)
{
  wchar_t *nptr;

  /*
   * A negative length used to be converted straight to size_t: -1 produced
   * a zero-byte request that was then written to, and other negative values
   * produced an enormous request.
   */
  if(length < 0) return(NULL);

  /* Widen before adding one so that length == INT_MAX cannot overflow. */
  nptr = (wchar_t *) malloc(sizeof(wchar_t) * ((size_t) length + 1));
  if(nptr != NULL) nptr[0] = L'\0';
  return(nptr);
}

/*
 * Duplicate a null-terminated wide string in freshly allocated storage
 * and return a pointer to the duplicate, or NULL if the allocation
 * fails. Equivalent to GNU wcsdup. The caller owns the returned storage.
 */


wchar_t *WCopyString(wchar_t *string)
{
  size_t units = wcslen(string) + 1;	/* characters plus terminator */
  wchar_t *dup = (wchar_t *) malloc(units * sizeof(wchar_t));

  if(dup == NULL) return(NULL);
  wcscpy(dup,string);
  return(dup);
}

/*
 * Copy the characters string[first] through string[last] (both inclusive)
 * into newly allocated storage, append a terminating null, and return a
 * pointer to the copy. The string is assumed to be long enough to contain
 * the whole range.
 *
 * last == first - 1 denotes an empty range and yields an empty string.
 * Returns NULL if the range is inverted beyond that (last < first - 1)
 * or if the allocation fails. The caller owns the returned storage.
 */

wchar_t *wcCopyRange(wchar_t *string,long first,long last)
{
  wchar_t *copy;
  long len;

  len = last - first + 1;
  /*
   * A negative length must not reach malloc: -1 used to request zero
   * bytes and then store the terminator into that empty allocation.
   */
  if(len < 0) return(NULL);

  copy = (wchar_t *) malloc(((size_t) len + 1) * sizeof(wchar_t));
  if(copy != NULL){
    wmemcpy(copy,string + first,(size_t) len);
    copy[len] = L'\0';
  }
  return(copy);
}

/*
 * Return the number of characters in a null-terminated UTF-8 string.
 * Every byte that is not a continuation byte (bit pattern 10xxxxxx)
 * is counted as the start of one character; no validation is done.
 */

static unsigned int utf8len (UTF8 *s) {
  int chars = 0;
  UTF8 *p;

  for (p = s; *p != '\0'; p++) {
    if ((*p & 0xC0) == 0x80) continue;	/* continuation byte */
    chars++;
  }
  return chars;
}

/*
 * Return the number of 16-bit code units that precede the terminating
 * zero unit of a UTF-16 string. A surrogate pair counts as two.
 */
static unsigned int utf16len(const UTF16 *s) {
  int units;

  for (units = 0; s[units] != 0; units++)
    ;
  return units;
}

/*
 * Return the number of code units that precede the terminating zero
 * of a UTF-32 string.
 */
static unsigned int utf32len(const UTF32 *s) {
  int units;

  for (units = 0; s[units] != 0; units++)
    ;
  return units;
}

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * The table is indexed by the number of trailing bytes in the sequence
 * (0 through 5), i.e. by the value taken from TrailingBytesForUTF8.
 *
 * u82u32 accumulates the raw bytes of a sequence, shifting left by six
 * bits between bytes, without first stripping the marker bits. Each entry
 * is the sum of those marker bits in the positions where they end up, so
 * one subtraction removes them all. For example, for one trailing byte:
 * (0xC0 << 6) + 0x80 == 0x3080.
 */
static const UTF32 OffsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
					 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of bytes that should follow.
 *
 * Each row below covers 32 byte values:
 *   0x00-0xBF -> 0    0xC0-0xDF -> 1    0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3    0xF8-0xFB -> 4    0xFC-0xFF -> 5
 * The table only classifies the lead byte; whether the resulting sequence
 * is legal is decided separately by isLegalUTF8P.
 */

static const char TrailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *	length = TrailingBytesForUTF8[*source]+1;
 *
 * source points at the lead byte and length is the total number of bytes
 * in the sequence, lead byte included. Up to length bytes are read,
 * starting with the last one, so the caller must make sure that many
 * bytes are readable. Returns TRUE if the sequence is accepted and FALSE
 * otherwise; any length outside 1..4 is rejected without reading anything.
 *
 * NOTE(review): there is no special case for lead byte 0xED, so three-byte
 * sequences that encode the surrogate range U+D800..U+DFFF are accepted.
 */

static Boolean isLegalUTF8P(const UTF8 *source, int length) {
  UTF8 a;
  /* Start one past the last byte; each case below steps back one byte. */
  const UTF8 *srcptr = source+length;
  switch (length) {
  default: return FALSE;
    /* Everything else falls through when "TRUE"... */
    /* Fourth and third bytes must be continuation bytes (0x80..0xBF). */
  case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE;
  case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return FALSE;
    /* Second byte: upper bound here, lower bound depends on the lead byte. */
  case 2: if ((a = (*--srcptr)) > 0xBF) return FALSE;
    switch (*source) {
      /* no fall-through in this inner switch */
      /* 0xE0 and 0xF0: a smaller second byte would be an overlong encoding. */
    case 0xE0: if (a < 0xA0) return FALSE; break;
    case 0xF0: if (a < 0x90) return FALSE; break;
      /* 0xF4: a larger second byte would encode a value above U+10FFFF. */
    case 0xF4: if (a > 0x8F) return FALSE; break;
    default:  if (a < 0x80) return FALSE;
    }
    /*
     * Lead byte: 0x80..0xC1 are rejected (continuation bytes, plus 0xC0 and
     * 0xC1 which could only start overlong forms), as is anything above 0xF4.
     */
  case 1: if (*source >= 0x80 && *source < 0xC2) return FALSE;
    if (*source > 0xF4) return FALSE;
  }
  return TRUE;
}

/* 
 * Translate a null-terminated UTF-8 string into a null-terminated wide
 * string held in newly allocated memory.
 *
 *   s       the UTF-8 input.
 *   t       on success, *t receives a pointer to the translation, which
 *           the caller then owns.
 *   maxptr  on success, *maxptr receives the largest code found in the
 *           input (0 for an empty string). Its incoming value is not
 *           consulted.
 *
 * Returns 0 on success, -1 if memory cannot be allocated, and
 * UTF8_BADINCODE if the input is not legal UTF-8 (including a multi-byte
 * sequence cut short by the end of the string). On failure a message is
 * written to stderr, *t and *maxptr are left untouched, and nothing
 * remains allocated.
 * 
 * We keep track of the maximum code since this allows
 * certain optimizations, e.g., if no characters outside
 * the ASCII range are encountered, we can use ASCII
 * case-folding, which is considerably faster than
 * Unicode case-folding.
 */

int u82u32 (UTF8 *s, wchar_t **t,wchar_t *maxptr) {
  wchar_t *buf;
  wchar_t *n;
  wchar_t ch;
  wchar_t max;
  UTF8 *s0;
  UTF8 *cptr;
  int BytesNeeded;
  int i;
  int truncated;
  size_t StorageNeeded;

  /* One output element per character, plus the terminator. */
  StorageNeeded = ((size_t) utf8len(s) + 1) * sizeof(UTF32);
  if((buf = (wchar_t *) malloc(StorageNeeded)) == NULL) {
    fprintf(stderr,"u82u32: malloc failure.\n");
    return(-1);
  }
  n = buf;
  s0 = s;
  max = 0;
  while (*s != '\0') {
    cptr = s;
    BytesNeeded = (int) TrailingBytesForUTF8[*s];

    /*
     * isLegalUTF8P examines the last byte of the sequence first, so make
     * sure every expected trailing byte lies before the terminator;
     * otherwise it would read past the end of the string.
     */
    truncated = FALSE;
    for (i = 1; i <= BytesNeeded; i++) {
      if (s[i] == '\0') {
        truncated = TRUE;
        break;
      }
    }

    if(truncated || !isLegalUTF8P(cptr,BytesNeeded+1)){
      fprintf(stderr,"u82u32: invalid UTF-8 input at byte %ld.\n",(long) (cptr-s0)+1);
      free(buf);		/* the caller never sees this buffer */
      return(UTF8_BADINCODE);
    }

    /* Accumulate the raw bytes; the marker bits are removed afterwards. */
    ch = 0;
    switch (BytesNeeded) {
    case 5:	ch += *cptr++; ch <<= 6;	/* fallthrough */
    case 4:	ch += *cptr++; ch <<= 6;	/* fallthrough */
    case 3:	ch += *cptr++; ch <<= 6;	/* fallthrough */
    case 2:	ch += *cptr++; ch <<= 6;	/* fallthrough */
    case 1:	ch += *cptr++; ch <<= 6;	/* fallthrough */
    case 0:	ch += *cptr++;
    }
    ch -= OffsetsFromUTF8[BytesNeeded];
    *n++ = ch;
    if(ch > max) max = ch;
    s+= (BytesNeeded +1);
  }
  *n = L'\0';
  *t = buf;
  *maxptr = max;
  return 0;
}

/*
 * Copy the bytes of the null-terminated UTF-8 string s into t, stopping
 * before the terminator. No terminating null is written to t.
 * Returns a pointer to the position in t just past the last byte copied.
 */
UTF8 *
strcpyu8(UTF8 *t, UTF8 *s) {
  UTF8 *in;
  UTF8 *out = t;

  for (in = s; *in != '\0'; in++) {
    *out++ = *in;
  }
  return (out);
}

/* 
 * Convert a single UTF32 character to UTF8, placing the result in the
 * supplied buffer and following it with a null byte. The buffer must be
 * large enough to accommodate it: up to four bytes plus the terminator.
 * Return the number of bytes in the UTF8 sequence, not counting the
 * terminator. Values of 0x200000 and above produce no bytes, so the
 * result is then 0 and the buffer holds an empty string.
 */

int
wc2utf8(UTF8 *t, wchar_t c){
  int len = 0;

  if (c < 0x80) {		/* ASCII */
    t[len++] = c;
  }
  else if (c < 0x800) {		/* two bytes */
    t[len++] = (0xC0 | (c >> 6));
    t[len++] = (0x80 | (c & 0x3F));
  }
  else if (c < 0x10000) {	/* three bytes */
    t[len++] = (0xE0 | (c >> 12));
    t[len++] = (0x80 | ((c >> 6) & 0x3F));
    t[len++] = (0x80 | (c & 0x3F));
  }
  else if (c < 0x200000) {	/* four bytes; 2^21 */
    t[len++] = (0xF0 | (c >> 18));
    t[len++] = (0x80 | ((c >> 12) & 0x3F));
    t[len++] = (0x80 | ((c >> 6) & 0x3F));
    t[len++] = (0x80 | (c & 0x3F));
  }
  t[len] = 0x00;
  return (len);
}


/*
 *  Convert a null-terminated UTF-32 string to a null-terminated UTF-8
 *  string in newly allocated storage, which the caller owns.
 *  Characters of 0x200000 and above are skipped. If memory cannot be
 *  obtained the program exits with status OUTOFMEMORY.
 */

UTF8 *
ws2u8(wchar_t *s) {
  UTF8 *buf;
  UTF8 *shrunk;
  wchar_t wc;
  int used = 0;			/* bytes written so far */

  /* Worst case is four bytes per character. */
  buf = (UTF8 *) malloc((wcslen(s)+1) * sizeof(UTF32));
  if(buf == NULL) exit(OUTOFMEMORY);

  for (; (wc = *s) != L'\0'; s++) {
    if (wc < 0x80) {		/* ASCII */
      buf[used++] = wc;
    }
    else if (wc < 0x800) {	/* two bytes */
      buf[used++] = (0xC0 | (wc >> 6));
      buf[used++] = (0x80 | (wc & 0x3F));
    }
    else if (wc < 0x10000) {	/* three bytes */
      buf[used++] = (0xE0 | (wc >> 12));
      buf[used++] = (0x80 | ((wc >> 6) & 0x3F));
      buf[used++] = (0x80 | (wc & 0x3F));
    }
    else if (wc < 0x200000) {	/* four bytes; 2^21 */
      buf[used++] = (0xF0 | (wc >> 18));
      buf[used++] = (0x80 | ((wc >> 12) & 0x3F));
      buf[used++] = (0x80 | ((wc >> 6) & 0x3F));
      buf[used++] = (0x80 | (wc & 0x3F));
    }
  }
  buf[used++] = '\0';

  /* Give back the part of the worst-case buffer that was not needed. */
  shrunk = (UTF8 *)realloc((void *)buf,used);
  if(shrunk == NULL) exit(OUTOFMEMORY);
  return (shrunk);
}


/*
 * Append the characters src[first] through src[last] (both inclusive)
 * to the end of the wide string dest and terminate the result.
 * Assume that there is enough space in dest.
 *
 * Note that the value returned is not the start of dest but the position
 * at which the appended characters begin, i.e. where dest's terminator
 * was on entry.
 */

wchar_t *
wcCatRange(wchar_t *dest, wchar_t *src,long first,long last)
{
  wchar_t *tail = dest + wcslen(dest);	/* current end of dest */
  long count = last - first + 1;
  long k = 0;

  while (k < count) {
    tail[k] = src[first + k];
    k++;
  }
  tail[k] = L'\0';
  return(tail);
}

/*
 * Constants for the surrogate-pair arithmetic in ConvertUTF32toUTF16 and
 * ConvertUTF16toUTF32 below.
 */
static const int halfShift	= 10; /* each surrogate of a pair carries 10 bits */
/* Subtracted before a value is split into a pair, added back after a pair is combined. */
static const UTF32 halfBase	= 0x0010000UL;
/* Selects the low 10 bits, which go into the low surrogate. */
static const UTF32 halfMask	= 0x3FFUL;


/*
 * Negative status codes that the converters below store through their
 * chcnt argument when they fail and return NULL.
 */
#define ERROR_OUTOFMEMORY  (-2)
#define SURROGATE_ERROR (-3)
#define EXCEEDS_UTF16_ERROR  (-4)


UTF16 *ConvertUTF32toUTF16 (const UTF32* s, int *chcnt) {
  UTF32 c;
  UTF16 *t;
  UTF16 *n;

  n = malloc(sizeof(UTF16) * ((2 * utf32len(s)) + 1));
  if(!n) {
    *chcnt = ERROR_OUTOFMEMORY;
    return NULL;
  }

  t = n;
  while ((c = *s++) != 0) {
    if (c <= UNI_MAX_BMP) {
      if ((c >= UNI_SUR_HIGH_START) && (c <= UNI_SUR_LOW_END)) {
	*chcnt = SURROGATE_ERROR;
	return NULL;
      }
      else *t++ = c;
    } else {
      if (c > UNI_MAX_UTF16) {
	*chcnt = EXCEEDS_UTF16_ERROR;
	return NULL;
      }
      else {
	c -= halfBase;
	*t++ = (c >> halfShift) + UNI_SUR_HIGH_START;
	*t++ = (c & halfMask) + UNI_SUR_LOW_START;
      }
    }
  }
  *t = 0;
  *chcnt = t - n;
  return n;
}

/*
 * Convert a zero-terminated UTF-16 string to a zero-terminated UTF-32
 * string in newly allocated storage, which the caller owns.
 *
 * On success the converted string is returned and *chcnt is set to the
 * number of UTF-32 code units written, not counting the terminator.
 * On failure NULL is returned, nothing remains allocated, and *chcnt is
 * set to one of:
 *   ERROR_OUTOFMEMORY  the output buffer could not be allocated;
 *   SURROGATE_ERROR    the input contains an unpaired surrogate, including
 *                      a high surrogate that is the last unit of the input.
 */
UTF32 *ConvertUTF16toUTF32 (const UTF16* s, int *chcnt) {
  /*
   * The current unit is held in a UTF32 so that a combined surrogate pair,
   * which is always 0x10000 or more, is not cut down to the width of a
   * UTF16 (presumably 16 bits) before it is stored.
   */
  UTF32 c;
  UTF32 c2;
  UTF32 *t;
  UTF32 *n;

  /* The output never has more code units than the input. */
  n = malloc(sizeof(UTF32) * ((size_t) utf16len(s) + 1));
  if(!n) {
    *chcnt = ERROR_OUTOFMEMORY;
    return NULL;
  }

  t = n;
  while ((c = *s++) != 0) {
    if (c >= UNI_SUR_HIGH_START && c <= UNI_SUR_HIGH_END) {
      /* *s is at worst the terminator, which is not a low surrogate. */
      c2 = *s;
      if (c2 < UNI_SUR_LOW_START || c2 > UNI_SUR_LOW_END) {
        /* Unpaired high surrogate */
        free(n);		/* do not leak the partial result */
        *chcnt = SURROGATE_ERROR;
        return NULL;
      }
      c = ((c - UNI_SUR_HIGH_START) << halfShift)
        + (c2 - UNI_SUR_LOW_START) + halfBase;
      ++s;
    } else if (c >= UNI_SUR_LOW_START && c <= UNI_SUR_LOW_END) {
      /* an unpaired low surrogate */
      free(n);			/* do not leak the partial result */
      *chcnt = SURROGATE_ERROR;
      return NULL;
    }
    *t++ = c;
  }
  *t = 0;
  *chcnt = (int) (t - n);
  return n;
}