File: nls_txt_to_in.java

package info (click to toggle)
clisp 1999-07-22-5
  • links: PTS
  • area: main
  • in suites: potato
  • size: 36,876 kB
  • ctags: 19,900
  • sloc: ansic: 76,750; lisp: 65,522; asm: 16,504; sh: 8,971; fortran: 8,277; makefile: 3,251; objc: 2,481; perl: 1,744; java: 553; sed: 96
file content (74 lines) | stat: -rw-r--r-- 2,462 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * Program for converting character set descriptions, such as those found on
 * ftp.unicode.org, into .in files containing one character per line.
 *
 * @author Bruno Haible
 */
import java.io.*;
import java.util.*;
/**
 * Converts a single-byte character set description read from standard input
 * into a table written to standard output, one entry per line.
 *
 * Input lines are expected to start with two "0x"-prefixed hexadecimal
 * tokens separated by spaces or tabs: the byte value (0x00..0xff) followed by
 * the code point it maps to (0x0000..0xffff). Everything from a '#' to the
 * end of the line is ignored, as are lines with fewer than two tokens.
 * Lines that cannot be interpreted are reported on standard error and skipped.
 *
 * The output contains one line for each byte value 0..255 except 10 and 13.
 * Code points below 128 are written as the character itself, all others as a
 * backslash, the letter u and four lowercase hexadecimal digits.
 */
public class nls_txt_to_in {
  /** Number of entries in the table: one per possible byte value. */
  static final int TABLE_SIZE = 256;
  /** Code point stored for byte values the input does not assign. */
  static final int UNASSIGNED = 0xfffd;
  /** Largest code point that fits into four hexadecimal digits. */
  static final int MAX_CODE_POINT = 0xffff;

  /**
   * Returns the lowercase hexadecimal digit for a value in the range 0..15.
   */
  static String toHexString1 (int i) {
    return String.valueOf("0123456789abcdef".charAt(i));
  }
  /**
   * Returns the low 8 bits of i as two lowercase hexadecimal digits.
   */
  static String toHexString2 (int i) {
    return  toHexString1((i>>4)&0x0f)
           +toHexString1(i&0x0f);
  }
  /**
   * Returns the low 16 bits of i as four lowercase hexadecimal digits.
   */
  static String toHexString4 (int i) {
    return  toHexString1((i>>12)&0x0f)
           +toHexString1((i>>8)&0x0f)
           +toHexString1((i>>4)&0x0f)
           +toHexString1(i&0x0f);
  }
  /**
   * Creates the initial table: entries 0..127 map to themselves (some of the
   * .TXT files lack the ASCII characters), entries 128..255 are unassigned.
   */
  static int[] newCharset () {
    int[] charset = new int[TABLE_SIZE];
    for (int i = 0; i < TABLE_SIZE; i++)
      charset[i] = (i < 128 ? i : UNASSIGNED);
    return charset;
  }
  /**
   * Reads a .TXT description line by line until end of input and stores
   * every valid mapping into charset.
   *
   * A mapping is stored only if the byte value is a valid index into charset
   * and the code point lies in 0..MAX_CODE_POINT; otherwise the line is
   * reported on standard error and the table is left untouched. Without this
   * check an entry such as "0x8140 0x3000" would abort the program with an
   * ArrayIndexOutOfBoundsException, and a code point above 0xffff would be
   * silently truncated on output.
   *
   * @param in      source of the description; not closed by this method
   * @param charset table to update in place
   * @throws IOException if reading from in fails
   */
  static void readCharset (BufferedReader in, int[] charset) throws IOException {
    for (;;) {
      String line = in.readLine();
      if (line == null)
        break;
      // Strip the trailing comment, if any.
      int commentStart = line.indexOf('#');
      if (commentStart >= 0)
        line = line.substring(0,commentStart);
      StringTokenizer tokenizer = new StringTokenizer(line," \t",false);
      if (!tokenizer.hasMoreTokens()) continue;
      String token1 = tokenizer.nextToken();
      if (!tokenizer.hasMoreTokens()) continue;
      String token2 = tokenizer.nextToken();
      if (!(token1.startsWith("0x") && token2.startsWith("0x"))) {
        System.err.println("unknown tokens: "+token1+" "+token2);
        continue;
      }
      int num1;
      int num2;
      try {
        num1 = Integer.parseInt(token1.substring(2),16);
        num2 = Integer.parseInt(token2.substring(2),16);
      } catch (NumberFormatException e) {
        System.err.println("unknown tokens: "+token1+" "+token2);
        continue;
      }
      // parseInt accepts a sign, so negative values must be rejected too.
      if (num1 < 0 || num1 >= charset.length
          || num2 < 0 || num2 > MAX_CODE_POINT) {
        System.err.println("out of range: "+token1+" "+token2);
        continue;
      }
      charset[num1] = num2;
    }
  }
  /**
   * Returns the output representation of one table entry, without the line
   * terminator: the character itself for code points below 128, otherwise a
   * backslash, the letter u and four hexadecimal digits.
   */
  static String formatEntry (int j) {
    if (j < 128)
      return String.valueOf((char)j);
    return "\\u"+toHexString4(j);
  }
  /**
   * Entry point. Takes no arguments; reads the description from standard
   * input and writes the table to standard output.
   * Exits with status 1 if any argument is given, 0 otherwise.
   *
   * @throws IOException if reading standard input fails
   */
  public static void main (String[] args) throws IOException {
    if (args.length != 0) {
      System.err.println("Usage: java nls_txt_to_in < charset.TXT > charset.in");
      System.exit(1);
    }
    int[] charset = newCharset();
    // Read and interpret the .TXT file line by line.
    {
      BufferedReader stream = new BufferedReader(new InputStreamReader(System.in));
      try {
        readCharset(stream,charset);
      } finally {
        stream.close();
      }
    }
    // Output the charset. Byte values 10 and 13 are skipped: they would be
    // written as line terminators and break the one-entry-per-line format.
    {
      PrintStream stream = System.out;
      for (int i = 0; i < TABLE_SIZE; i++)
        if (i != 10 && i != 13)
          stream.println(formatEntry(charset[i]));
    }
    System.exit(0);
  }
}