1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
########################################################################
# $Id: CJK.perl,v 1.8 2002/04/26 16:06:52 RRM Exp $
# CJK.perl
# Jens Lippmann <lippmann@rbg.informatik.tu-darmstadt.de>,
# Boy Yang <yangboy@math.ntu.edu.tw>,
# Werner Lemberg <xlwy01@uxp1.hrz.uni-dortmund.de>
#
# Extension to LaTeX2HTML V 96.2 to supply support for the
# "CJK" LaTeX package.
#
########################################################################
# Change Log:
# ===========
# jcl = Jens Lippmann
#
# $Log: CJK.perl,v $
# Revision 1.8 2002/04/26 16:06:52 RRM
# -- JIS is EUC-JP, not ISO-2022-JP.
#
# Revision 1.7 2002/04/26 14:17:31 RRM
# -- fixed MIME names for the encodings; thanks to Jungshik Shin for
# the correct names
#
# Revision 1.6 2002/04/24 22:27:00 RRM
# -- automatic recognition of document charset, based upon the
# encoding in the first {CJK} or {CJK*} environment.
#
# Revision 1.5 1999/06/06 14:24:59 MRO
#
#
# -- many cleanups wrt. to TeXlive
# -- changed $* to /m as far as possible. $* is deprecated in perl5, all
# occurrences should be removed.
#
# Revision 1.4 1999/04/09 18:11:27 JCL
# changed my e-Mail address
#
# Revision 1.3 1998/02/19 22:24:26 latex2html
# th-darmstadt -> tu-darmstadt
#
# Revision 1.2 1996/12/17 17:11:41 JCL
# typo
#
# Revision 1.1 1996/12/17 17:07:32 JCL
# - introduced to CVS repository
# - adjusted technical notes according to Werner's proposal
# - added support for CJK* environment
#
# jcl 16-DEC-96 - Created
#
########################################################################
# Notes:
# You can view the results only with a browser configured for the
# specific language.
# To configure the browser, use e.g. the "document encoding" menu
# of Netscape.
#
# Technical Notes:
# We use the pre_process hook to change any text coming in to
# LaTeX2HTML such that we convert from the outer representation
# of double byte characters to an inner, LaTeX2HTML specific
# representation.
# The two outer representations recognized are described as follows:
# o standard CJK encodings (GB, KS, Big5, SJIS, etc.)
# Each symbol is formed by two characters, the first in the range
# [\201-\237\241-\376] (octal) or 0x81-0x9F, 0xA1-0xFE (hexadecimal),
# the second in the range
# [\100-\176\200-\377] (octal) or 0x40-0x7E, 0x80-0xFF (hexadecimal).
# o CJK internal encoding (to conveniently use CJK processed files)
# Each symbol is a sequence with a leading character in the range
# [\201-\237\241-\376] or 0x81-0x9F, 0xA1-0xFE,
# a sequence of digits forming the decimal representation of the
# second character from standard encoded form (eg. "65", "128"),
# and a trailing 0xFF.
# The internal LaTeX2HTML representation is the same as the CJK
# internal encoding described above (lead byte, decimal digits, 0xFF).
# Additionally, we handle TeX's normalized representation of special
# characters (eg. ^^e4), which is helpful when LaTeX2HTML processes
# the .aux file.
#
# The post_process hook will convert the LaTeX2HTML internal coding
# into standard Big5/SJIS encoding, which then remains in the
# HTML text.
#
# The revert_to_raw_tex hook will convert the internal encoding
# back to standard encoding to help with image creation.
#
########################################################################
package main;
# Lookup table for the encoding argument of the CJK environments:
# key   = encoding name as written in the LaTeX source,
# value = charset name recorded for the generated document.
%CJK_charset = (
    'Bg5'    => 'Big5',
    'Bg5+'   => 'Big5Plus',
    'Bg5hk'  => 'Big5-HKSCS',
    'GB'     => 'gb2312',
    'GBt'    => 'gbt_12345',
    'GBK'    => 'GBK',
    # 'JIS'  => 'ISO-2022-JP',    # replaced: JIS is EUC-JP, see change log
    'JIS'    => 'EUC-JP',
    'SJIS'   => 'Shift_JIS',
    'KS'     => 'EUC-KR',
    'UTF8'   => 'UTF-8',
    'EUC-TW' => 'X-EUC-TW',
    'EUC-JP' => 'EUC-JP',
    'EUC-KR' => 'EUC-KR',
    'CP949'  => 'X-Windows-949',
);

# A value already stored in $CJK_AUTO_CHARSET wins; otherwise fall back to
# the charset registered for 'Bg5'.  The result is used for both $CHARSET
# and $charset.
if (!defined $CJK_AUTO_CHARSET) {
    $CJK_AUTO_CHARSET = '';
}
$CHARSET = $CJK_AUTO_CHARSET || $CJK_charset{'Bg5'};
$charset = $CHARSET;
sub pre_pre_process {
    # Works in place on $_ and converts incoming text into the l2h
    # internal form.  Three passes, in this order:

    # 1. TeX's "^^X" notation where X is not a digit or a-f:
    #    the character code of X is shifted by 64 and masked to 7 bits.
    #    (LaTeX2HTML *might* do this itself, but we do not rely on it.)
    s/\^\^([^0-9a-f])/chr((ord($1) + 64) & 127)/gem;

    # 2. TeX's "^^xx" notation with two hex digits (digits or a-f).
    s/\^\^([0-9a-f][0-9a-f])/chr(hex $1)/gem;

    # 3. Standard CJK two-byte symbol -> lead byte, decimal value of the
    #    second byte, and a closing \377.
    s/([\201-\237\241-\376])([\100-\176\200-\376])/join('', $1, ord($2), "\377")/gem;
}
sub post_post_process {
    # Operates on $_: every l2h internal symbol (lead byte, decimal
    # digits, \377) is turned back into the standard two-byte CJK form,
    # i.e. the lead byte followed by the character with that decimal code.
    s/([\201-\237\241-\376])(\d+)\377/$1 . chr($2)/ge;
}
sub revert_to_raw_tex_hook {
    # Same conversion as in post_post_process, applied to $_:
    # lead byte + decimal digits + \377  ->  lead byte + the character
    # whose code those digits spell (standard CJK encoding).
    s/([\201-\237\241-\376])(\d+)\377/join('', $1, chr $2)/ge;
}
sub do_cmd_CJKchar {
    # Takes the text following the command as its first argument and
    # returns it with the command's arguments replaced by one symbol in
    # l2h internal form.
    local $_ = $_[0];

    # Drop a leading optional argument, if any (works on $_).
    &get_next_optional_argument;

    # First braced argument: a number, replaced by the character with
    # that code.
    s/$next_pair_rx/chr $2/eo;

    # Second braced argument: kept as written and closed with \377.
    s/$next_pair_rx/$2 . "\377"/eo;

    return $_;
}
# Handle CJK environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
#
# Argument: the environment contents, starting with the arguments of
#   \begin{CJK}: an optional font encoding, the CJK encoding, and the
#   CJK font family.
# Returns: the contents with those three arguments removed.
# Side effects: the first environment whose encoding is listed in
#   %CJK_charset fixes $CJK_AUTO_CHARSET, $CHARSET and $charset (unless
#   $CJK_AUTO_CHARSET was set before).  Later environments are only
#   checked against $CHARSET; unknown or conflicting encodings are
#   reported through &write_warning.
#
sub do_env_CJK {
    local($_) = @_;
    my ($cjk_enc);

    # skip font encoding
    &get_next_optional_argument;

    # handle CJK encoding; try both bracket forms before giving up
    $cjk_enc = &missing_braces unless
        ((s/$next_pair_pr_rx/$cjk_enc = $2; ''/eo)
        ||(s/$next_pair_rx/$cjk_enc = $2; ''/eo));
    $cjk_enc =~ s/^\s+|\s+$//g;

    if ($cjk_enc) {
        my $requested = $CJK_charset{$cjk_enc};
        if (!defined $requested) {
            &write_warning ( "unknown charset code: $cjk_enc in CJK environment.");
        } elsif (!$CJK_AUTO_CHARSET) {
            $CJK_AUTO_CHARSET = $charset = $CHARSET = $requested;
        } elsif (lc($CHARSET) eq lc($requested)) {
            # compatible; do nothing.
            # Charset names are compared without regard to case, so a
            # preset such as 'big5' does not clash with 'Big5'.
        } else {
            &write_warning ( "Only one charset allowed per document: $CHARSET");
            &write_warning ( "Ignoring request for ".$requested);
        }
    }

    # skip CJK font family
    s/$next_pair_rx//o;
    $_;
}
# Handle CJK* environments.
# The usage of \CJKspace, \CJKnospace is not implemented yet.
# We won't catch single newlines following CJK symbols, because
# this would require to suppress the newlines in the HTML output,
# leading to overly long lines.
#
# Argument: the environment contents; they are handed on to do_env_CJK
#   unchanged.
# Returns: the result of do_env_CJK with blanks and tabs that directly
#   follow a CJK symbol (in l2h internal form) removed.
#
sub do_env_CJKstar {
    # Same argument handling as the unstarred environment.
    local($_) = &do_env_CJK(@_);

    # CJK symbols eat ensuing white space (spaces and tabs only).
    # $1 is used on the replacement side; \1 is only meant for use
    # inside the pattern.
    s/([\201-\237\241-\376]\d+\377)[ \t]+/$1/g;
    $_;
}
# Hand the CJK package commands listed below to &ignore_commands.
# Most of the commands here need some action which is not implemented yet.
# NOTE(review): the "# {}" and "# []" markers after a command name are part
# of the string passed to &ignore_commands, not Perl comments; they look
# like per-command argument specifications -- confirm against the
# definition of &ignore_commands in LaTeX2HTML.
&ignore_commands(<<_IGNORED_CMDS_);
CJKCJKchar
CJKboldshift
CJKcaption # {}
CJKenc # {}
CJKencfamily # [] # {} # {}
CJKfamily # {}
CJKfontenc # {} # {}
CJKglue
CJKhangul
CJKhangulchar
CJKhanja
CJKkern
CJKlatinchar
CJKnospace
CJKspace
CJKtilde
CJKtolerance
CJKuppercase
Unicode # {} # {}
nbs
standardtilde
_IGNORED_CMDS_
# We need \AtBeginDocument and \AtEndDocument as well: both are handed to
# &ignore_commands, each with one "# {}" marker (part of the string, not a
# Perl comment).
&ignore_commands(<<_IGNORED_CMDS_);
AtBeginDocument # {}
AtEndDocument # {}
_IGNORED_CMDS_
# This must be the last line: the file ends with a true value.
1;
|