File: eucjp_gen.sh

package info (click to toggle)
qtwebengine-opensource-src 5.7.1%2Bdfsg-6.1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 1,028,096 kB
  • ctags: 1,436,736
  • sloc: cpp: 5,960,176; ansic: 3,477,727; asm: 395,492; python: 320,633; sh: 96,504; perl: 69,449; xml: 42,977; makefile: 28,408; objc: 9,962; yacc: 9,847; tcl: 5,430; lex: 2,259; ruby: 1,053; lisp: 522; awk: 497; pascal: 310; cs: 249; sed: 53
file content (109 lines) | stat: -rwxr-xr-x 3,409 bytes parent folder | download | duplicates (19)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   https://encoding.spec.whatwg.org/#euc-jp
#   https://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
#   https://www.iana.org/assignments/charset-reg/CP51932
#   Table 3-64 in CJKV Information Processing 2/e.

# Download the following two files, run it in source/data/mappings directory
# and save the result to euc-jp-html5.ucm
#   https://encoding.spec.whatwg.org/index-jis0208.txt
#   https://encoding.spec.whatwg.org/index-jis0212.txt

function preamble {
cat <<PREAMBLE
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for EUC-JP
# *   described at https://encoding.spec.whatwg.org/#euc-jp.
# *
# ***************************************************************************
<code_set_name>               "euc-jp-html"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  3
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \x3F
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state>                   a1-fe
<icu:state>                   a1-df
<icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state>                   a1-fe.u

CHARMAP
PREAMBLE
}

#<U0000> \x00 |0
function ascii {
  for i in $(seq 0 127)
  do
    printf '<U%04X> \\x%02X |0\n' $i $i
  done
}


# Map 0x8E 0x[A1-DF] to U+FF61 to U+FF9F
function half_width_kana {
  for i in $(seq 0xA1 0xDF)
  do
    # 65377 = 0xFF61, 161 = 0xA1
    printf '<U%04X> \\x8E\\x%02X |0\n' $(($i + 65377 - 161))  $i
  done
}


# index-jis0208.txt has index pointers larger than the size of
# the encoding space available in 2-byte Graphic plane of ISO-2022-based
# encoding (94 x 94 = 8836). We have to exclude them because they're for
# Shift-JIS.
# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
# All the bi-directional mapping entries come *before* the uni-directional
# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
# the same Unicode code point earlier in the list. According to the definition
# of 'index pointer' in the W3C encoding spec, it's the first entry in the
# file for a given Unicode code point.

function jis208 {
  awk '!/^#/ && !/^$/ && $1 <= 8836  \
       { printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\
                 $1 / 94 + 0xA1, $1 % 94 + 0xA1,\
                 ($2 in uset) ? 3 : 0); \
         uset[$2] = 1;
       }' \
  index-jis0208.txt
}

# JIS X 212 is for decoding only (use '|3' to denote that).

function jis212 {
  awk '!/^#/ && !/^$/ \
       { printf ("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),\
                 $1 / 94 + 0xA1, $1 % 94 + 0xA1);}' \
  index-jis0212.txt
}

function unsorted_table {
  ascii
  half_width_kana
  jis208
  jis212
  echo '<U00A5> \x5C |1'
  echo '<U203E> \x7E |1'
  echo '<U2212> \xA1\xDD |1'
}

wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt
wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0212.txt
preamble
unsorted_table | sort  | uniq
echo 'END CHARMAP'