File: gen-utf.c

package info (click to toggle)
utfcheck 1.2-5
links: PTS, VCS
area: main
in suites: forky, sid
size: 2,360 kB
sloc: lex: 472; sh: 262; ansic: 130; makefile: 42
file content (213 lines) | stat: -rw-r--r-- 5,840 bytes
parent folder | download | duplicates (4)
/*
   gen-utf.c - generate ASCII, UTF-8, and UTF-16 test files.

   Paul Hardy, 2018
*/

#include <stdio.h>

/* Convert a code point to UTF-8; defined later in this file. */
int cvt2utf8 (unsigned, unsigned *);

/* Pairs an output file name with the routine that writes its contents. */
struct sample_file {
   const char *filename;
   void (*emit) (FILE *);
};

/* Write the three-byte UTF-8 Byte Order Mark (EF BB BF). */
static void
put_utf8_bom (FILE *fp)
{
   fputc (0xEF, fp);
   fputc (0xBB, fp);
   fputc (0xBF, fp);
}

/*
   Write the single bytes 0x00 through 0x7E.
   NOTE(review): the range stops before 0x7F (DEL) -- confirm intended.
*/
static void
put_ascii_range (FILE *fp)
{
   int codept;

   for (codept = 0x00; codept < 0x7F; codept++)
      fputc (codept, fp);
}

/* Encode one code point with cvt2utf8 and write the resulting bytes. */
static void
put_utf8 (FILE *fp, unsigned codept)
{
   unsigned utf_bytes[5];
   int nbytes;
   int i;

   nbytes = cvt2utf8 (codept, utf_bytes);
   for (i = 0; i < nbytes; i++)
      fputc ((int) utf_bytes [i], fp);
}

/* sample-ascii.txt: plain ASCII range. */
static void
emit_ascii (FILE *fp)
{
   put_ascii_range (fp);
}

/* sample-utf8-bom-begin.txt: Byte Order Mark followed by the ASCII range. */
static void
emit_utf8_bom_begin (FILE *fp)
{
   put_utf8_bom (fp);
   put_ascii_range (fp);
}

/* sample-utf8-bom-end.txt: ASCII range followed by an embedded BOM. */
static void
emit_utf8_bom_end (FILE *fp)
{
   put_ascii_range (fp);
   put_utf8_bom (fp);
}

/* sample-utf8.txt: every code point except the surrogate range. */
static void
emit_utf8_full (FILE *fp)
{
   unsigned codept;

   for (codept = 0x00; codept < 0xD800; codept++)
      put_utf8 (fp, codept);
   /* Skip over Unicode Surrogate Pair range; not valid UTF-8 */
   for (codept = 0xE000; codept <= 0x10FFFF; codept++)
      put_utf8 (fp, codept);
}

/* sample-utf16-be.txt: BOM FE FF, then U+0000..U+0100 high byte first. */
static void
emit_utf16_be (FILE *fp)
{
   int codept;

   fputc (0xFE, fp);
   fputc (0xFF, fp);
   for (codept = 0x0000; codept <= 0x0100; codept++) {
      fputc ((codept >> 8) & 0xFF, fp);
      fputc ( codept       & 0xFF, fp);
   }
}

/* sample-utf16-le.txt: BOM FF FE, then U+0000..U+0100 low byte first. */
static void
emit_utf16_le (FILE *fp)
{
   int codept;

   fputc (0xFF, fp);
   fputc (0xFE, fp);
   for (codept = 0x0000; codept <= 0x0100; codept++) {
      fputc ( codept       & 0xFF, fp);
      fputc ((codept >> 8) & 0xFF, fp);
   }
}

/* sample-binary.txt: every byte value 0x00..0xFF once. */
static void
emit_binary (FILE *fp)
{
   int codept;

   for (codept = 0x00; codept <= 0xFF; codept++)
      fputc (codept & 0xFF, fp);
}

/* sample-utf8-surrogate.txt: encoded U+D800 and U+DC00 -- not valid UTF-8. */
static void
emit_utf8_surrogate (FILE *fp)
{
   put_utf8 (fp, 0xD800);
   put_utf8 (fp, 0xDC00);
}

/*
   Generate all sample files in the current directory.

   Each file is opened in binary mode so the exact byte values are
   written on every platform, then checked for write errors and closed.

   Returns 0 on success, or 1 after reporting the first file that could
   not be opened, written, or closed.
*/
int
main (void)
{
   static const struct sample_file samples[] = {
      { "sample-ascii.txt",          emit_ascii          },
      { "sample-utf8-bom-begin.txt", emit_utf8_bom_begin },
      { "sample-utf8-bom-end.txt",   emit_utf8_bom_end   },
      { "sample-utf8.txt",           emit_utf8_full      },
      { "sample-utf16-be.txt",       emit_utf16_be       },
      { "sample-utf16-le.txt",       emit_utf16_le       },
      { "sample-binary.txt",         emit_binary         },
      { "sample-utf8-surrogate.txt", emit_utf8_surrogate }
   };
   const size_t nsamples = sizeof samples / sizeof samples[0];
   size_t i;          /* Loop variable */
   int write_failed;  /* Nonzero if the stream reported an error */
   FILE *utffp;

   for (i = 0; i < nsamples; i++) {
      utffp = fopen (samples[i].filename, "wb");
      if (utffp == NULL) {
         perror (samples[i].filename);
         return 1;
      }

      samples[i].emit (utffp);

      /* fclose flushes buffered data, so its result matters as well. */
      write_failed = ferror (utffp);
      if (fclose (utffp) != 0)
         write_failed = 1;
      if (write_failed) {
         fprintf (stderr, "gen-utf: error writing %s\n", samples[i].filename);
         return 1;
      }
   }

   return 0;
}


/*
   Convert a Unicode code point to its UTF-8 byte sequence.
   The allowable Unicode range is U+0000..U+10FFFF.

   codept     - the Unicode code point to encode
   utf8_bytes - an array of 5 unsigned elements that receives the
                encoding, one byte value (0x00..0xFF) per element.
                Up to 4 elements hold encoded bytes; every element
                after the last encoded byte, through utf8_bytes[4],
                is set to 0.

   Returns the number of encoded bytes (1 to 4).  If codept is above
   U+10FFFF, returns -1 and sets utf8_bytes[0..3] to 0xFF and
   utf8_bytes[4] to 0.

   Surrogate code points (U+D800..U+DFFF) are not rejected; they are
   encoded as three bytes like any other value in U+0800..U+FFFF.
*/
int
cvt2utf8 (unsigned codept, unsigned *utf8_bytes)
{
   int byte_length;    /* number of bytes of UTF-8 */
   int i;

   /* Clear all five slots first so unused trailing elements are 0. */
   for (i = 0; i < 5; i++)
      utf8_bytes [i] = 0;

   /*
      Select the encoding length by comparing against the first code
      point of each longer form.
   */
   if (codept < 0x80) {             /* U+0000..U+007F */
      byte_length = 1;
      utf8_bytes [0] = codept;
   }
   else if (codept < 0x800) {       /* U+0080..U+07FF */
      byte_length = 2;
      utf8_bytes [0] = 0xC0 | ((codept >>  6) & 0x1F);
      utf8_bytes [1] = 0x80 | ( codept        & 0x3F);
   }
   else if (codept < 0x10000) {     /* U+0800..U+FFFF */
      byte_length = 3;
      utf8_bytes [0] = 0xE0 | ((codept >> 12) & 0x0F);
      utf8_bytes [1] = 0x80 | ((codept >>  6) & 0x3F);
      utf8_bytes [2] = 0x80 | ( codept        & 0x3F);
   }
   else if (codept <= 0x10FFFF) {   /* U+010000..U+10FFFF */
      byte_length = 4;
      utf8_bytes [0] = 0xF0 | ((codept >> 18) & 0x07);
      utf8_bytes [1] = 0x80 | ((codept >> 12) & 0x3F);
      utf8_bytes [2] = 0x80 | ((codept >>  6) & 0x3F);
      utf8_bytes [3] = 0x80 | ( codept        & 0x3F);
   }
   else { /* flag out of range Unicode code point */
      /*
         0xFF is never a valid byte in UTF-8, so testing
         for it is an easy check for an invalid return value.
      */
      byte_length = -1;
      utf8_bytes [0] = 0xFF;
      utf8_bytes [1] = 0xFF;
      utf8_bytes [2] = 0xFF;
      utf8_bytes [3] = 0xFF;
   }

   return byte_length;
}


/*
   Count the significant binary digits of an unsigned number, i.e. the
   1-based position of its highest set bit.  Only the low 32 bits are
   examined.  Returns 0 when none of those bits is set.
*/
int
bin_digits (unsigned itest)
{
   unsigned remaining = itest & 0xFFFFFFFFu;  /* bits not yet counted */
   int ndigits = 0;

   /* Each shift discards one low bit; stop once nothing is left. */
   while (remaining != 0) {
      remaining >>= 1;
      ndigits++;
   }

   return ndigits;
}