File: CJK.perl

package info (click to toggle)
latex2html 2015-debian1-1
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 7,652 kB
  • ctags: 3,386
  • sloc: perl: 30,941; makefile: 429; sh: 155
file content (231 lines) | stat: -rw-r--r-- 6,760 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231

########################################################################
# $Id: CJK.perl,v 1.8 2002/04/26 16:06:52 RRM Exp $
# CJK.perl
#   Jens Lippmann <lippmann@rbg.informatik.tu-darmstadt.de>,
#   Boy Yang <yangboy@math.ntu.edu.tw>,
#   Werner Lemberg <xlwy01@uxp1.hrz.uni-dortmund.de>
#
# Extension to LaTeX2HTML V 96.2 to supply support for the
# "CJK" LaTeX package.
#
########################################################################
# Change Log:
# ===========
#  jcl = Jens Lippmann
#
# $Log: CJK.perl,v $
# Revision 1.8  2002/04/26 16:06:52  RRM
#  --  JIS is EUC-JP, not ISO-2022-JP.
#
# Revision 1.7  2002/04/26 14:17:31  RRM
#  --  fixed MIME names for the encodings; thanks to Jungshik Shin for
#      the correct names
#
# Revision 1.6  2002/04/24 22:27:00  RRM
#  --  automatic recognition of document charset, based upon the
#      encoding in the first {CJK} or {CJK*} environment.
#
# Revision 1.5  1999/06/06 14:24:59  MRO
#
#
# -- many cleanups wrt. to TeXlive
# -- changed $* to /m as far as possible. $* is deprecated in perl5, all
#    occurrences should be removed.
#
# Revision 1.4  1999/04/09 18:11:27  JCL
# changed my e-Mail address
#
# Revision 1.3  1998/02/19 22:24:26  latex2html
# th-darmstadt -> tu-darmstadt
#
# Revision 1.2  1996/12/17 17:11:41  JCL
# typo
#
# Revision 1.1  1996/12/17 17:07:32  JCL
# - introduced to CVS repository
# - adjusted technical notes according to Werner's proposal
# - added support for CJK* environment
#
# jcl  16-DEC-96 - Created
#
########################################################################
# Notes:
# You may view the results only with a browser configured for the
# specific language.
# To configure the browser, use eg. the "document encoding" menu
# of NetScape.
#
# Technical Notes:
# We use the pre_process hook to change any text coming in to
# LaTeX2HTML such that we convert from the outer representation
# of double byte characters to an inner, LaTeX2HTML specific
# representation.
# The two outer representations recognized are described as follows:
# o standard CJK encodings (GB, KS, Big5, SJIS, etc.)
#   Each symbol is formed by two characters, the first in the range
#   [\201-\237\241-\376] (octal) or 0x81-0x9F, 0xA1-0xFE (hexadecimal),
#   the second in the range
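#   [\100-\176\200-\377] (octal) or 0x40-0x7E, 0x80-0xFF (hexadecimal).
#   (The pattern in pre_pre_process stops at \376 / 0xFE for the second
#   character, so a trailing 0xFF is not treated as part of a pair.)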
# o CJK internal encoding (to conveniently use CJK processed files)
#   Each symbol is a sequence with a leading character in the range
#   [\201-\237\241-\376] or 0x81-0x9F, 0xA1-0xFE,
#   a sequence of digits forming the decimal representation of the
#   second character from standard encoded form (eg. "65", "128"),
#   and a trailing 0xFF.
# The internal LaTeX2HTML representation is the same as the CJK
# internal encoding described above.
# Additionally, we handle TeX's normalized representation of special
# characters (eg. ^^e4), which is helpful when LaTeX2HTML processes
# the .aux file.
#
# The post_process hook will convert the LaTeX2HTML internal coding
# into standard Big5/SJIS encoding, which then remains in the
# HTML text.
#
# The revert_to_raw_tex hook will convert the internal encoding
# back to standard encoding to help with image creation.
#
########################################################################


package main;

# possible values for the 1st optional argument to \begin{CJK}
# and the corresponding charset:

# Lookup table: encoding name as written in the CJK environment
# argument  =>  charset name used for the generated document.
%CJK_charset = (
    'Bg5'    => 'Big5',
    'Bg5+'   => 'Big5Plus',
    'Bg5hk'  => 'Big5-HKSCS',
    'GB'     => 'gb2312',
    'GBt'    => 'gbt_12345',
    'GBK'    => 'GBK',
    # 'JIS' used to be mapped to 'ISO-2022-JP'; it is EUC-JP (see the
    # change log, revision 1.8).
    'JIS'    => 'EUC-JP',
    'SJIS'   => 'Shift_JIS',
    'KS'     => 'EUC-KR',
    'UTF8'   => 'UTF-8',
    'EUC-TW' => 'X-EUC-TW',
    'EUC-JP' => 'EUC-JP',
    'EUC-KR' => 'EUC-KR',
    'CP949'  => 'X-Windows-949',
);

# A value already present in $CJK_AUTO_CHARSET wins; otherwise the
# entry for 'Bg5' serves as the default charset for input and output.
if (!defined $CJK_AUTO_CHARSET) {
    $CJK_AUTO_CHARSET = '';
}
$CHARSET = $CJK_AUTO_CHARSET ? $CJK_AUTO_CHARSET : $CJK_charset{'Bg5'};
$charset = $CHARSET;


sub pre_pre_process {
    # Works in place on $_ and converts incoming text to the l2h
    # internal form.
    #
    # Step 1: undo TeX's ^^ notation.  "^^" followed by one character
    # that is not a lowercase hex digit becomes the character whose code
    # is that character's code plus 64, taken modulo 128.
    s{\^\^([^0-9a-f])}{ chr( (ord($1) + 64) & 127 ) }gem;
    # "^^" followed by two lowercase hex digits becomes the character
    # with that hexadecimal code.
    s{\^\^([0-9a-f][0-9a-f])}{ chr(hex $1) }gem;
    # Step 2: standard CJK encoding -> l2h internal form.  The lead
    # character is kept; the second character is replaced by its decimal
    # code followed by a \377 terminator.
    s{([\201-\237\241-\376])([\100-\176\200-\376])}{ $1 . ord($2) . "\377" }gem;
}

sub post_post_process {
    # Works in place on $_: every l2h internal sequence (lead character,
    # decimal digits, \377) is turned back into the two-character
    # standard CJK form.
    s{([\201-\237\241-\376])(\d+)\377}{ $1 . chr($2) }ge;
}

sub revert_to_raw_tex_hook {
    # Operates on $_.  The decimal digits and the \377 terminator that
    # follow a lead character in the l2h internal form are collapsed
    # into the single character they stand for, which gives back the
    # standard CJK encoding.
    s!([\201-\237\241-\376])(\d+)\377!$1 . chr $2!ge;
}


sub do_cmd_CJKchar {
    # Handler for \CJKchar: rewrites its two arguments into one symbol
    # in the l2h internal form and returns the modified text.
    local($_) = @_;
    # Drop the optional argument, if there is one.
    &get_next_optional_argument;
    # First argument: replaced by the character with that code.
    # NOTE(review): assumes $2 of $next_pair_rx is the content of the
    # next braced argument -- confirm against LaTeX2HTML's definition.
    s{$next_pair_rx}{chr($2)}eo;
    # Second argument: its text is kept and terminated with \377.
    s{$next_pair_rx}{$2\377}o;
    return $_;
}

# Handle CJK environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
#
sub do_env_CJK {
    # Handler for the CJK environment.  Strips the environment's
    # arguments from the text, records the document charset on first
    # use, and returns the remaining text.
    local($_) = @_;
    my $cjk_enc;

    # The optional font-encoding argument is consumed and discarded.
    &get_next_optional_argument;

    # Take the encoding name from the next argument; if neither
    # pattern matches, use whatever &missing_braces returns.
    unless ((s/$next_pair_pr_rx/$cjk_enc = $2; ''/eo)
        || (s/$next_pair_rx/$cjk_enc = $2; ''/eo)) {
        $cjk_enc = &missing_braces;
    }
    $cjk_enc =~ s/^\s+|\s+$//g;

    if ($cjk_enc) {
        my $requested = $CJK_charset{$cjk_enc};
        if (!defined $requested) {
            &write_warning ( "unknown charset code: $cjk_enc in CJK environment.");
        }
        elsif (!$CJK_AUTO_CHARSET) {
            # First recognised encoding fixes the document charset.
            $CJK_AUTO_CHARSET = $charset = $CHARSET = $requested;
        }
        elsif ($CHARSET ne $requested) {
            # A later environment asks for a different charset; a
            # matching request needs no action at all.
            &write_warning ( "Only one charset allowed per document: $CHARSET");
            &write_warning ( "Ignoring request for ".$requested);
        }
    }

    # The CJK font family argument is dropped as well.
    s/$next_pair_rx//o;
    return $_;
}

# Handle CJK* environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
# We won't catch single newlines following CJK symbols, because
# this would require to suppress the newlines in the HTML output,
# leading to overly long lines.
#
sub do_env_CJKstar {
    # Handler for the CJK* environment.  Returns the environment text
    # after do_env_CJK has processed it, with blanks and tabs removed
    # where they directly follow a CJK symbol.
    #
    # The bare "&do_env_CJK;" call hands this sub's own @_ on to
    # do_env_CJK unchanged.
    local($_) = &do_env_CJK;
    # CJK symbols (lead character, decimal digits, \377 terminator) eat
    # ensuing white space.  The replacement uses $1; the former \1 is
    # the backreference syntax meant for the pattern side only.
    s/([\201-\237\241-\376]\d+\377)[ \t]+/$1/g;
    $_;
}

# most of the commands here need some action which is not implemented yet.

# Hand the list of CJK package commands below to &ignore_commands.
# Each heredoc line names one command, written without its leading
# backslash.
# NOTE(review): the trailing "# {}" / "# []" markers presumably describe
# the mandatory / optional arguments to be skipped together with the
# command -- confirm against LaTeX2HTML's ignore_commands.
&ignore_commands(<<_IGNORED_CMDS_);
CJKCJKchar
CJKboldshift
CJKcaption # {}
CJKenc # {}
CJKencfamily # [] # {} # {}
CJKfamily # {}
CJKfontenc # {} # {}
CJKglue
CJKhangul
CJKhangulchar
CJKhanja
CJKkern
CJKlatinchar
CJKnospace
CJKspace
CJKtilde
CJKtolerance
CJKuppercase
Unicode # {} # {}
nbs
standardtilde
_IGNORED_CMDS_


# we need \AtBeginDocument and \AtEndDocument

# Pass AtBeginDocument and AtEndDocument to &ignore_commands as well.
# NOTE(review): "# {}" presumably marks one mandatory argument that is
# skipped along with the command -- confirm against LaTeX2HTML's
# ignore_commands.
&ignore_commands(<<_IGNORED_CMDS_);
AtBeginDocument # {}
AtEndDocument # {}
_IGNORED_CMDS_

# This must be the last line.
1;