# Perl5 script to make a mapping table between Unicode and Non-Unicode character sets.
# Invoke as:
#   perl ucs-to-iso.pl /path-to/UnicodeData-Latest.txt /path-to-text-mapping-datas/*.{TXT,xml}
# or
#   perl -s ucs-to-iso.pl -what=<WHAT> /path-to/UnicodeData-Latest.txt /path-to-text-mapping-datas/*.{TXT,xml}
# where <WHAT> is comma separted list of words:
#   cf, ces, cv, u2i, pool, i2u, noa, j1, f2h, h2f, uprop, or iprop.

my $beginning = time;

BEGIN {
  require 5;
  require 'ucs-to-iso-lib.pl';
}

$what = 'cf,ces,cv,u2i,pool,i2u,noa,j1,f2h,h2f,uprop,iprop' if (!defined($what));
$info = 1 if (!defined($info));

if ($info) {
  select((select(STDERR), $| = 1)[0]);

  $info = sub {
    my $text = sprintf $_[0], @_[1 .. $#_];
    my $now = time - $beginning;

    foreach (split(/\n/, $text)) {
      printf STDERR "%02d:%02d:%02d: %s\n", int($now / 3600), int(($now % 3600) / 60), $now % 60, $_;
    }
  };
}
else {
  $info = sub {1;};
}

my %what = map {$_ => 1} split(/,/, $what);

$cf_h = $what{cf} ? 'mbcesconf.h' : '/dev/null' if (!defined($cf_h));
$ces_c = $what{ces} ? 'mbces.c' : '/dev/null' if (!defined($ces_c));
$ces_h = $what{ces} ? 'mbces.h' : '/dev/null' if (!defined($ces_h));
$cestab_h = $what{ces} ? 'mbcestab.mk_btri.h' : '/dev/null' if (!defined($cestab_h));
$cesdef_h = $what{ces} ? 'mbcesdefs.h' : '/dev/null' if (!defined($cesdefs_h));
$cvt_h = $what{ces} ? 'mbconvtab.mk_btri.h' : '/dev/null' if (!defined($cvt_h));
$u2i_h = $what{u2i} ? 'ucs-to-iso.mk_btri.h' : '/dev/null' if (!defined($u2i_h));
$u2i_pool_h = $what{u2i} ? 'ucs-to-iso-pool.h' : '/dev/null' if (!defined($u2i_pool_h));
$i2u_h = $what{i2u} ? 'iso-to-ucs.mk_btri.h' : '/dev/null' if (!defined($i2u_h));
$noa_h = $what{noa} ? 'notascii.mk_btri.h' : '/dev/null' if (!defined($noa_h));
$j1_h = $what{j1} ? 'jis1flag.mk_btri.h' : '/dev/null' if (!defined($j1_h));
$f2h_h = $what{f2h} ? 'full-to-half.mk_btri.h' : '/dev/null' if (!defined($f2h_h));
$h2f_h = $what{h2f} ? 'half-to-full.mk_btri.h' : '/dev/null' if (!defined($h2f_h));
$prop_h = $what{uprop} ? 'prop.mk_btri.h' : '/dev/null' if (!defined($prop_h));

my $udata = shift(@ARGV);
my ($map, $notascii_map, $jis1map) = &make_map($info);

local (*UD, *U, *U_PO, *I, *I_PO, *NOA, *J1, *F2H, *H2F, *PROP, *CVT, *CD, *CT, *CH, *CC, *CF);

open(UD, $udata) || die "open(UD, \"$udata\"): $!";
open(U, ">$u2i_h") || die "open(U, \">$u2i_h\"): $!";
open(U_PO, ">$u2i_pool_h") || die "open(U_PO, \">$u2i_pool_h\"): $!";
open(I, ">$i2u_h") || die "open(I, \">$i2u_h\"): $!";
open(NOA, ">$noa_h") || die "open(NOA, \">$noa_h\"): $!";
open(J1, ">$j1_h") || die "open(J1, \">$j1_h\"): $!";
open(F2H, ">$f2h_h") || die "open(F2H, \">$f2h_h\"): $!";
open(H2F, ">$h2f_h") || die "open(H2F, \">$h2f_h\"): $!";
open(PROP, ">$prop_h") || die "open(PROP, \">$prop_h\"): $!";
open(CVT, ">$cvt_h") || die "open(CVT, \">$cvt_h\"): $!";
open(CD, ">$cesdef_h") || die "open(CD, \">$cesdef_h\"): $!";
open(CT, ">$cestab_h") || die "open(CT, \">$cestab_h\"): $!";
open(CH, ">$ces_h") || die "open(CH, \">$ces_h\"): $!";
open(CC, ">$ces_c") || die "open(CC, \">$ces_c\"): $!";
open(CF, ">$cf_h") || die "open(CF, \">$cf_h\"): $!";

my $glob;

foreach $glob ((\*U, \*I, \*NOA, \*J1, \*F2H, \*H2F, \*PROP)) {
  print $glob <<EOF;
%%TYPE number
%%BEGIN

EOF
}

$info->("Prepare Unicode => CES mapping");

my @ces = CES->all;
my ($ui, $ucs, $pool, $last, $cprop, @iso, %i2u_pool, $iso, @uptab, @h2f, @f2h);

$info->("CJK table");

my @iptab =
  sort {
    $a->[0] <=> $b->[0];
  } (
     (
      map {
	[&MB_WORD_94x94_ENC($_, 0*94),
	 &MB_WORD_94x94_ENC($_, 94*94-1),
	 MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL];
      } (0x40 .. 0x42, 0x44 .. 0x5F)
      ),
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x21 - 0x21, 0x21 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x23 - 0x21, 0x7E - 0x21)),
      MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x24 - 0x21, 0x21 - 0x21)),
      &MB_WORD_94x94_ENC(&dbc2c(0x43, 0x24 - 0x21, 0x7E - 0x21)),
      MB_CPROP_MAY_BREAK],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x25 - 0x21, 0x21 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x28 - 0x21, 0x30 - 0x21)),
      MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x28 - 0x21, 0x31 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x28 - 0x21, 0x4C - 0x21)),
      MB_CPROP_MAY_BREAK],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x28 - 0x21, 0x4D - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x29 - 0x21, 0x30 - 0x21)),
      MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x29 - 0x21, 0x31 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x29 - 0x21, 0x4C - 0x21)),
      MB_CPROP_MAY_BREAK],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x29 - 0x21, 0x4D - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x2F - 0x21, 0x7E - 0x21)),
      MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x30 - 0x21, 0x21 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x48 - 0x21, 0x7E - 0x21)),
      MB_CPROP_MAY_BREAK],
     [&MB_WORD_94x94_ENC(0x43, &dbc2c(0x49 - 0x21, 0x21 - 0x21)),
      &MB_WORD_94x94_ENC(0x43, &dbc2c(0x7E - 0x21, 0x7E - 0x21)),
      MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL],
     (map {
       my $fc = &{'MB_' . $_ . '_FC'};

       [&MB_WORD_DBC_ENC($fc, 0), &MB_WORD_DBC_ENC($fc, MB_DBC_UNIT - 1), MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL];
     } qw(BIG5 EUC_TW)),
     (map {
       my $fc = &{'MB_' . $_ . '_FC'};

       [&MB_WORD_DBC_ENC($fc, 0), &MB_WORD_DBC_ENC($fc, MB_DBC_UNIT - 1), MB_CPROP_MAY_BREAK];
     } qw(JOHAB UHANG)),
     );

while (defined($line = <UD>)) {
  my ($hex, $name, $type, $fh) = (split(/\;/, $line))[0, 1, 2, 5];

  ($last, $cprop) = (hex($hex));

  if ($name =~ /^<.*First>$/i) {
    $ucs = $last;
  }
  else {
    $ucs = $last if ($name !~ /^<.*Last>$/i);

    if ($name =~ /\bHANGUL\b/i) {
      $cprop |= MB_CPROP_MAY_BREAK;
    }
    elsif ($name =~ /\b(CJK|HIRAGANA|KATAKANA|KANA|FULLWIDTH|IDEOGRAPHIC)\b/i) {
      $cprop |= MB_CPROP_MAY_BREAK | MB_CPROP_EOL_TO_NULL;
    }

    if ($type =~ /^Z/) {
      $cprop |= MB_CPROP_IS_SPACE | MB_CPROP_MAY_BREAK if ($name !~ /\bNO-BREAK\s*SPACE\b/i);
    }
    elsif ($type =~ /^P/) {
      if ($type =~ /[is]$/) {
	$cprop |= MB_CPROP_NEVER_EOL;
      }
      elsif ($type =~ /[fe]$/ || $name =~ /(FULL\s*STOP|COMMA)$/i) {
	$cprop |= MB_CPROP_NEVER_BOL;
      }
    }
    elsif ($type =~ /^M/) {
      $cprop = MB_CPROP_NEVER_BOL;
    }

    push(@uptab, [$ucs, $last, $cprop]) if (defined($cprop));
    printf F2H "0x%X-0x%X,0x%X\n", $ucs, $last, hex($1) if ($fh =~ /^<wide>\s+([0-9A-Fa-f]+)$/);
    printf H2F "0x%X-0x%X,0x%X\n", $ucs, $last, hex($1) if ($fh =~ /^<narrow>\s+([0-9A-Fa-f]+)$/);

    for (; $ucs <= $last ; ++$ucs) {
      $info->("Processing U+%X", $ucs) unless ($ucs & 0xFFF);

      if (ref($map->[$ucs]) eq 'HASH') {
	@iso = sort {$a <=> $b} grep {$_ ne ''} keys %{$map->[$ucs]};
	printf U "0x%X,%uU\n", $ucs, $pool if (@iso && $ucs >= 0x80);

	while (@iso) {
	  $iso = shift(@iso);
	  &add_to_range_tab(($iso & ~MB_NON_UCS_MARK) + MB_NON_UCS_LOWER,
			    ($iso & ~MB_NON_UCS_MARK) + MB_NON_UCS_LOWER,
			    $cprop, \@iptab) if (defined($cprop));

	  if ($ucs >= 0x80) {
	    printf U_PO "0x%X,\n", ($iso | (@iso ? 0 : MB_U2I_POOL_LAST_MARK));
	    ++$pool;
	    printf I "0x%X,0x%X\n", ($iso & ~MB_NON_UCS_MARK) + MB_NON_UCS_LOWER, $ucs;
	  }
	}

	if ($ucs >= 0x80 && $ucs < @$notascii_map && ref($notascii_map->[$ucs]) eq 'HASH') {
	  printf NOA "0x%X,1\n", $iso while (($iso) = each %{$notascii_map->[$ucs]});
	}
      }
    }
  }
}

printf J1 "0x%X,0x%X\n", $jis, $flag while (($jis, $flag) = each %$jis1map);

foreach $ptab (\@uptab, \@iptab) {
  if (@$ptab) {
    ($code, $last, $cprop) = @{$ptab->[0]}[0, 1, $#{$ptab->[0]}];

    for ($i = 1 ; $i < @$ptab ; ++$i) {
      $elem = $ptab->[$i];

      if ($last + 1 == $elem->[0] && $cprop == $elem->[$#$elem]) {
	$last = $elem->[1];
      }
      else {
	printf PROP "0x%X-0x%X,0x%X\n", $code, $last, $cprop;
	($code, $last, $cprop) = @{$elem}[0, 1, $#$elem];
      }
    }

    printf PROP "0x%X-0x%X,0x%X\n", $code, $last, $cprop;
  }
}

close(PROP);

my ($c, $h) = get_C_H;

print CC "#include \"mb.h\"\n", $c;
print CH $h;
print CT "#include \"$cf_h\"\n%%BEGIN default_ces_tab\n";

print CVT join("\n",
	       "#include \"$cf_h\"",
	       "#include \"mblangconf.h\"",
	       '%%BEGIN default_conv_tab',
	       '"ascii",mb_conv_ascii',
	       '"ces",mb_conv_for_ces',
	       '#ifdef USE_UCS',
	       '"to-ucs",mb_conv1_to_ucs',
	       '"full-to-half",mb_conv_f2h',
	       '"f2h",mb_conv_f2h',
	       '"half-to-full",mb_conv_h2f',
	       '"h2f",mb_conv_h2f',
	       '"jisx0213",mb_conv_to_jisx0213',
	       '"jisx0213-aggressive",mb_conv_to_jisx0213_aggressive',
	       '#ifdef USE_WIN1252',
	       '"ms-latin1",mb_conv_ms_latin1',
	       '#endif',
	       '"jis0208-to-ucs-extra",mb_jisx0208_to_ucs_extra',
	       '"ucs-to-jis0208-extra",mb_ucs_to_jisx0208_extra',
	       '#endif',
	       '');

my ($gn, $ces, @cesdef, %ifdefname);
my %auto_detect_factor = qw(UTF16 2
			    UTF16BE 2
			    UTF16LE 2
			    );

foreach $ces (@ces) {
  my $id = $ces->id;
  my ($tc, $th, $tt);

  $info->("Making CES: $id");
  ($c, $h) = ($ces->c, $ces->h);
  ($tc, $th) = &make_decmap($id, @{$ces->decmap_spec});
  $c .= $tc;
  $h .= $th;

  my $cv = $ces->converters;
  my $g = $ces->G;
  my $ifdefname;

  if ($g->[1] !~ /^mb_(G[0-9nN]|UTF8|MOEINTERNAL)$/) {
    $ifdefname = $g->[1];
    $ifdefname =~ s/^mb_//;
    $ifdefname = sprintf($ifdefname, $id);
  }

  if (ref($cv) eq 'ARRAY' && grep(/%s/, @$cv)) {
    $c .= join("\n",
	       '#ifdef USE_UCS',
	       'mb_wchar_t',
	       "mb_conv_to_$id(mb_wchar_t enc, mb_ces_t *ces)",
	       '{',
	       "return mb_conv_for_decoder(enc, &mb_decmap_$id);",
	       '}',
	       'size_t',
	       "mb_conv_ws_to_$id(mb_wchar_t *ws, mb_wchar_t *ws_end, mb_info_t *info)",
	       '{',
	       "return mb_conv_for_specific_ces(ws, ws_end, &mb_ces_$id);",
	       '}',
	       '#endif',
	       '',
	       );

    $h .= join("\n",
	       '#ifdef USE_UCS',
	       "extern mb_wchar_t mb_conv_to_$id(mb_wchar_t, mb_ces_t *);",
	       "extern size_t mb_conv_ws_to_$id(mb_wchar_t *, mb_wchar_t *, mb_info_t *);",
	       '#endif',
	       '');
    print CVT join("\n",
		   ($ifdefname ne '' ? "#if defined(USE_$ifdefname) && defined(USE_UCS)" : "#ifdef USE_UCS"),
		   (map {sprintf('"to-%s",mb_conv_ws_to_%s', $_, $id)} @{$ces->names}),
		   '#endif',
		   '');
  }

  my $dd = $ces->default_decoder;

  $c .= join("\n",
	     sprintf('static const char *mb_namev_%s[] = {', $id),
	     (map {"\"$_\",";} @{$ces->names}),
	     '};',
	     (ref($cv) eq 'ARRAY' ?
	      (sprintf('static mb_conv_t mb_convv_%s[] = {', $id),
	       '#ifdef USE_UCS',
	       (map {sprintf($_,$id) . ','} @$cv),
	       '#else',
	       (map {sprintf($_,$id) . ','} grep(!/%s/, @$cv)),
	       '#endif',
	       'NULL,',
	       '};') :
	      ()),
	     "mb_ces_t mb_ces_$id = {",
	     "mb_namev_$id,",
	     sprintf('%s,%s,', @{$ces->flag}),
	     sprintf('{%s,%s,{%s,%s,%s,%s},{%s,%s,%s,%s}},',
		     (map {sprintf($_,$id)} @{$g}[0,1]),
		     (map {defined($_)?$_:'mb_nSETs'} @{$g->[2]}[0..3]),
		     (map {sprintf("0x%02X",$_)} @{$g->[3]}[0..3])),
	     (ref($cv) eq 'ARRAY' ? "mb_convv_$id," : qw(NULL,)),
	     "&mb_decmap_$id,",
	     defined($dd)?sprintf($dd,$id).',':'NULL,',
	     defined($auto_detect_factor{$id})?$auto_detect_factor{$id}.',':'1,',
	     '};',
	     '');

  $h .= "extern mb_ces_t mb_ces_$id;\n";
  $tt = join("\n", (map {"\"$_\",&mb_ces_$id"} @{$ces->names}), '');

  my $encmap = $ces->encmap_spec;

  if (ref($encmap) eq 'ARRAY') {
    ($tc, $th) = &make_encmap($id, @$encmap);
    $c .= $tc;
    $h .= $th;
  }

  if ($id !~ /^(ASCII|UTF(8|16([BL]E)?)|MOEINTERNAL)$/) {
    foreach ('MB_' . $id . '_FC', 'MB_' . $id . '_FC_MAX') {
      $h .= sprintf("#define %s (0x%02X)\n", $_, &{$_}) if (defined(&{$_}));
    }

    if ($ifdefname ne '') {
      unless ($ifdefname{$ifdefname}) {
	$ifdefname{$ifdefname} = 1;
	print CF '#define USE_', $ifdefname, "\n";
      }

      $c = "#ifdef USE_$ifdefname\n$c#endif\n";
      $h = "#ifdef USE_$ifdefname\n$h#endif\n";
      $tt = "#ifdef USE_$ifdefname\n$tt#endif\n";
      print CD "#ifdef USE_$ifdefname\ndefces($id, $ifdefname)\n#endif\n";
    }
  }

  print CC $c;
  print CH $h;
  print CT $tt;
}

close(CF);
close(CC);
close(CH);
close(CT);
close(CD);
close(CVT);
close(H2F);
close(F2H);
close(J1);
close(NOA);
close(I);
close(U);

exit;

sub bsearch_range {
  my ($key, $tab) = @_;
  my ($b, $e, $i, $elem, $cmp);

  for ($b = 0, $e = @$tab ; $b < $e ;) {
    $elem = $tab->[$i = int(($b + $e) / 2)];

    if ($key < $elem->[0]) {
      $e = $i;
    }
    elsif ($key > $elem->[1]) {
      $b = $i + 1;
    }
    else {
      return (0, $i);
    }
  }

  (-1, $e);
}

sub add_to_range_tab {
  my ($min, $max, $val, $tab) = @_;
  my ($next, $i);

  (undef, $i) = &bsearch_range($min, $tab);

  for (; $i < @$tab ; ++$i) {
    $next = $tab->[$i];

    if ($min < $next->[0]) {
      if ($max < $next->[0]) {
	splice(@$tab, $i, 0, [$min, $max, $val]);
	return;
      }
      elsif ($#$next == 2 && $next->[2] == $val) {
	$next->[0] = $min;
      }
      else {
	splice(@$tab, $i, 0, [$min, $next->[0] - 1, $val]);
	$min = $next->[0];
	++$i;
      }
    }
    elsif ($min > $next->[0]) {
      if ($#$next == 2 && $next->[2] == $val) {
	$min = $next->[0];
      }
      else {
	splice(@$tab, $i, 0, [$next->[0], $min - 1, @{$next}[2 .. $#$next]]);
	$next->[0] = $min;
	++$i;
      }
    }

    if ($max < $next->[1]) {
      if ($#$next != 2 || $next->[2] != $val) {
	splice(@$tab, $i, 0, [$min, $max, grep($_ != $val, @{$next}[2 .. $#$next]), $val]);
	$next->[0] = $max + 1;
      }

      return;
    }

    my @diff = grep($_ != $val, splice(@$next, 2));

    push(@$next, @diff, $val);
    return if ($max == $next->[1]);
    $min = $next->[1] + 1;
  }

  push(@$tab, [$min, $max, $val]) if ($min <= $max);
}

sub make_range_tab {
  my (@result, $i);

  &add_to_range_tab(@{shift(@_)}[0,1,2], \@result) while (@_);
  \@result;
}
