File: utf8map.pl

package info (click to toggle)

kannel 1.4.5-22

links: PTS, VCS
area: main
in suites: forky, sid
size: 16,284 kB
sloc: ansic: 105,659; sh: 32,211; xml: 20,360; php: 1,103; perl: 711; makefile: 583; yacc: 548; awk: 133; python: 122; javascript: 27; pascal: 3

file content (73 lines) | stat: -rw-r--r-- 1,916 bytes

parent folder | download | duplicates (8)

#!/usr/bin/perl -wn
#
# utf8map.pl - remap ascii to utf8
#
# This program builds a conversion table from ascii into utf8.
# The first half of the ascii table does not require any changes (because
# the utf8 encoding of [0-127] is the same as in ascii). To control the
# encoding of the second half we have to specify the unicode codes for all
# characters in the [128-255] interval. The program takes the unicode codes
# for those 128 characters on input (in hex format with a leading 0x) and
# generates the conversion table.
# Every utf8 code is padded with '0' bytes so that it occupies 4 bytes.
# The output is suitable for use in 'C' programs.
#
# For example,
#  in windows-1257 table character 169 '(c)' has code 0x00A9 in unicode.
#  The program will generate the following string:
#
#  0xC2, 0xA9, 0x00, 0x00, /*   169           0x00a9 */
# \______________________/     \___/         \______/
#    utf8 code (2 bytes        ascii         unicode
#    with padding)
#
# USAGE:
#   perl utf8map.pl asci_128-255_unicode_table.txt
#
# Andrejs Dubovskis
#

use strict ;

use vars qw/$N/ ;

BEGIN {
  # The generated table covers only the upper half of the 8-bit range, so
  # row numbering starts at 128.  The assignment lives in BEGIN because the
  # -n switch on the #! line re-runs the script body for every input line;
  # a plain top-level assignment would reset the counter each time.
  $N = 0x80 ;
}

# Encode one unicode code as a list of utf8 bytes.
#
# Arguments:
#   $num   - the code as a number
#   $label - the input token the number came from (used in error messages)
#
# Returns the utf8 bytes (two or three of them) as a list, or an empty list
# for 0, which is the only value below 0x80 accepted on input.
#
# Dies when the code cannot be placed into the table:
#   - it is larger than 0xFFFF (would need more than three bytes),
#   - it is a UTF-16 surrogate (0xD800-0xDFFF), which has no valid utf8 form,
#   - it is a non-zero 7-bit value.
sub _utf8_bytes {
  my ($num, $label) = @_ ;

  die "too large number: $label" if $num > 0xffff ;

  # surrogates would otherwise be emitted as an ill-formed 3-byte sequence
  die "surrogate code point: $label" if $num >= 0xd800 && $num <= 0xdfff ;

  if ($num > 0x07ff) {
    # three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    return (
	    (($num >> 12) & 0xf) | 0xe0,
	    (($num >> 6) & 0x3f) | 0x80,
	    ($num & 0x3f) | 0x80
	   ) ;
  }

  if ($num > 0x7f) {
    # two bytes: 110xxxxx 10xxxxxx
    return (
	    (($num >> 6) & 0x1f) | 0xc0,
	    ($num & 0x3f) | 0x80
	   ) ;
  }

  # only zero is legal here
  die "wrong input data: $label" if $num ;

  return ;
}

# Process every hex number (unicode code) found on the current input line
# ($_ is supplied by the -n switch) and print one table row for each.
for my $hex (/0x[\da-f]+/ig) {
  my @out = _utf8_bytes(hex($hex), $hex) ;

  # pad by '0' so that every row holds exactly four bytes
  push(@out, 0) while @out < 4 ;

  # output utf8 code
  printf("0x%02X,\t0x%02X,\t0x%02X,\t0x%02X,\t", @out) ;
  # output comments: character number and the unicode code as given on input
  print "/*\t$N\t$hex\t*/\n" ;

  # characters in [128-255] interval only; stop once the table is complete
  exit if ++$N > 255 ;
}