%
% File load-unicode-xetex-classes.tex
%
% Copyright 2015-2019 The LaTeX3 Project
%
% It may be distributed and/or modified under the conditions of
% the LaTeX Project Public License (LPPL), either version 1.3c of
% this license or (at your option) any later version. The latest
% version of this license is in the file
% http://www.latex-project.org/lppl.txt.
%
% Issues with this file should be reported at
% https://github.com/latex3/unicode-data
%
% This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
% Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
% the following classes of code point:
% - "ID" (ideographic)
% - "CJ" (conditional Japanese starter)
% - "OP" (opener)
% - "CL" (closer)
% - "NS" (non-starter)
% - "EX" (exclamation)
% - "IS" (infix separator)
% - "CM" (combining marks)
%
% All code points of classes "ID" and "CJ" are assigned to a \XeTeXcharclass,
% but for other classes this only occurs when they fall into east Asian width
% type "F", "H" or "W" (full-, half- and wide-width).
%
% The following mappings between Unicode and XeTeX classes occur
% - "ID" and "CJ" are class 1
% - "OP" is class 2
% - "CL", "NS", "EX", "IS" are class 3
% - "CM" is class 256 (ignored)
% as standard: these may be over-ridden by defining \XeTeXcharclass<class>
% as required. (If classes "ID" or "CL" are explicitly set, the other members
% of the same groups above will inherit these values.)
%
% This file does _not_ activate XeTeX's inter-character token mechanism
% (\XeTeXinterchartokenstate is not set) nor does it install any material in
% the inter-character token registers.
%
% Note that this file is separate from the main loader as the data structure
% here may need more refinement at the macro level.
%
% =============================================================================
%
% The data loaded here can currently only be used by XeTeX: check for the
% appropriate primitive.
% The test relies on \undefined itself having no definition. \expandafter
% expands the \fi on the following line before \endinput takes effect, so
% the conditional is already closed when TeX stops reading this file at the
% end of that line.
\ifx\XeTeXcharclass\undefined
\expandafter\endinput
\fi
% Just in case, check for the e-TeX extensions.
% (\unless, \ifdefined and \ifcsname are used further down.)
\ifx\eTeXversion\undefined
\expandafter\endinput
\fi
% This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
% |#| may not be correct. Everything is done in a group so that only the
% settings we want to propagate are made available generally.
% This group is closed by the \endgroup on the last line of the file.
\begingroup
% Make the braces usable as group delimiters for the definitions below.
\catcode`\{=1 %
\catcode`\}=2 %
% Write some basic information to the log.
% |^| has to be a superscript character for the |^^J| notation to work;
% with |^^J| as the \newlinechar it produces a line break inside \message.
\catcode`\^=7 %
\newlinechar=`\^^J %
\message{^^J}%
\message{load-unicode-xetex-classes.tex v1.16 (2022-09-17)^^J}%
\message{Reading Unicode east Asian character class data^^J}%
% A string version of |#| will be needed to look for comment lines in the
% source. Once that is done proper parsing can begin.
% Store a category code 12 ("other") |#| in \hash, then make |#| the macro
% parameter character for the definitions that follow.
\catcode`\#=12 %
\def\hash{#}%
\catcode`\#=6 %
% \firsttoken<tokens>\relax leaves only the first token (or brace group) of
% <tokens>: the rest, up to and including the \relax, is thrown away.
\def\firsttoken#1#2\relax{#1}%
% Pass one line of data on to \parseunicodedataII unless its first token is
% |#|, which marks a comment line. The |?| makes sure \firsttoken always has
% something to return.
\def\parseunicodedataI#1\relax{%
  \if\hash\firsttoken#1?\relax
    % Comment line: nothing to do.
  \else
    \parseunicodedataII#1\relax
  \fi
}%
% Both files to be parsed here have potential ranges of code points: find the
% first entry and search for the second.
% #1 is everything before the first |;| (one code point or a |first..last|
% range), #2 is the text between that |;| and the next space (the property
% value) and #3, the remainder of the line, is dropped. Appending |....| to
% #1 supplies the delimiters \parseunicodedataIII looks for whether or not
% #1 already contains |..|.
\def\parseunicodedataII#1;#2 #3\relax{%
\parseunicodedataIII#1....\relax{#2}%
}%
% From plain: may not be defined (yet).
% \loop saves everything up to \repeat as \body and starts the iteration.
\def\loop#1\repeat{\def\body{#1}\iterate}%
% \body has to leave exactly one conditional open. While that conditional
% is true \next is \iterate and the loop goes round again; once it is false
% \next is \relax and the loop stops.
\def\iterate{%
\body
\let\next\iterate
\else
\let\next\relax
\fi
\next
}%
% With \repeat equivalent to \fi, conditionals stay balanced when TeX skips
% over a complete \loop...\repeat inside another conditional.
\let\repeat\fi
% A shared routine for reading the data files: only one part of the parser
% has to be altered.
% \read returns \par for an empty line: \storedpar has the same meaning, so
% \ifx can be used to spot blank lines.
\def\storedpar{\par}%
% \readandparse{<name>} reads <name>.txt one line at a time and hands each
% non-blank line to \parseunicodedataI.
\def\readandparse#1{%
  \openin0=#1.txt %
  % |#| introduces comments in the data files and has to be read as an
  % ordinary character, so that it can be compared with \hash.
  \catcode`\#=12 %
  % The first two lines of the file carry the version information: copy
  % them to the log.
  \read0 to \unicodedataline
  \message{\unicodedataline ^^J}%
  \read0 to \unicodedataline
  \message{\unicodedataline ^^J}%
  \loop\unless\ifeof0 %
    \read0 to \unicodedataline
    \ifx\unicodedataline\storedpar
      % Blank line: skip it.
    \else
      \expandafter\parseunicodedataI\unicodedataline\relax
    \fi
  \repeat
  \catcode`\#=6 %
  \closein0 %
}%
% Set up the different line break classes recognised.
% Each class only receives its default if \XeTeXcharclass<class> has not
% been defined before this point.
% Ideographic characters; conditional Japanese starters follow them.
\unless\ifdefined\XeTeXcharclassID
  \chardef\XeTeXcharclassID=1 %
\fi
\unless\ifdefined\XeTeXcharclassCJ
  \let\XeTeXcharclassCJ\XeTeXcharclassID
\fi
% Openers.
\unless\ifdefined\XeTeXcharclassOP
  \chardef\XeTeXcharclassOP=2 %
\fi
% Closers; exclamation, infix separator and non-starter characters take
% whatever value the closer class has.
\unless\ifdefined\XeTeXcharclassCL
  \chardef\XeTeXcharclassCL=3 %
\fi
\unless\ifdefined\XeTeXcharclassEX
  \let\XeTeXcharclassEX\XeTeXcharclassCL
\fi
\unless\ifdefined\XeTeXcharclassIS
  \let\XeTeXcharclassIS\XeTeXcharclassCL
\fi
\unless\ifdefined\XeTeXcharclassNS
  \let\XeTeXcharclassNS\XeTeXcharclassCL
\fi
% Combining marks belong in XeTeX's "ignored" class. That class is number
% 256 only in engines that have 256 character classes: from XeTeX 0.99994
% the range of classes is larger, 256 is an ordinary class and the ignored
% class is 4096. \XeTeXversion followed by \XeTeXrevision spells out the
% engine version as a decimal (for example 0.99998), which is compared here
% as a dimension. A value set before this file is read is left alone.
\ifdefined\XeTeXcharclassCM
\else
  \ifdim\number\XeTeXversion\XeTeXrevision pt<0.99994pt %
    \chardef\XeTeXcharclassCM=256 %
  \else
    \chardef\XeTeXcharclassCM=4096 %
  \fi
\fi
% Check the line break class and if necessary the east Asian width for the
% current code point. For code points of class |ID| or |CJ| there may be a
% range to set, and these are always recorded. In other cases, if the code
% point belongs to one of the classes we may need to set up, its class is
% saved for checking against the list of east Asian widths.
% Macro versions of the two class names, for \ifx comparison with \temp.
\def\ID{ID}%
\def\CJ{CJ}%
% #1 is the first (or only) code point and #2 the last code point of a range,
% empty for a single code point; both are hexadecimal digits. #3 soaks up
% the unused delimiter dots and #4 is the line break class.
\def\parseunicodedataIII#1..#2..#3\relax#4{%
\def\temp{#4}%
% The number tested is "0" followed by a "1" for a match, so the test is
% true exactly when the class is ID or CJ.
\ifnum 0%
\ifx\temp\ID 1\fi
\ifx\temp\CJ 1\fi
>0 %
% A single code point is handled as a range that starts and ends on it.
\ifx\relax#2\relax
\parseunicodedataIV{#1}{#1}{#4}%
\else
\parseunicodedataIV{#1}{#2}{#4}%
\fi
\else
% Other classes are of interest only when \XeTeXcharclass<class> is defined.
% The class name is stored in a macro named |LB@<code point in decimal>|,
% which is looked up again when EastAsianWidth.txt is read.
\ifcsname XeTeXcharclass#4\endcsname
\ifx\relax#2\relax
\expandafter\def\csname LB@\number"#1\endcsname{#4}%
\else
% This code runs inside the \loop of \readandparse, and the \loop below
% redefines \body: keep a copy and restore it once the range is done.
\let\savedbody\body
\count0="#1 %
\loop
\unless\ifnum\count0>"#2 %
\expandafter\def\csname LB@\number\count0 \endcsname{#4}%
\advance\count0 by 1 %
\repeat
\let\body\savedbody
\fi
\fi
\fi
}%
% As we are inside a loop already, there needs to be a group here to preserve
% the iterator.
% \parseunicodedataIV{<first>}{<last>}{<class>} gives every code point from
% <first> to <last> inclusive (both hexadecimal) the class number held in
% \XeTeXcharclass<class>. Because of the group each assignment has to be
% \global.
\def\parseunicodedataIV#1#2#3{%
  \begingroup
    \count0="#1 %
    \loop
      \unless\ifnum\count0>"#2 %
        \global\XeTeXcharclass\count0=\csname XeTeXcharclass#3\endcsname
        \advance\count0 by 1 %
    \repeat
  \endgroup
}%
% First pass over the data: line break classes.
\readandparse{LineBreak}%
% For |EastAsianWidth.txt|, action is only needed if the character has width
% |F|, |H| or |W|. Once again there may be a range of characters to handle.
% Same argument pattern as for the first pass, but #4 is now the east Asian
% width. Only its first character is examined.
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  % "0" followed by a "1" for a match: true when the width starts with
  % F, H or W.
  \ifnum 0%
    \if F\firsttoken#4\relax 1\fi
    \if H\firsttoken#4\relax 1\fi
    \if W\firsttoken#4\relax 1\fi
      >0 %
    \ifx\relax#2\relax
      % Single code point, passed on as a hexadecimal constant.
      \parseunicodedataIV{"#1}%
    \else
      % Range: step through it one code point at a time. The group keeps
      % the \body of the loop in \readandparse intact.
      \begingroup
        \count0="#1 %
        \loop
          \unless\ifnum\count0>"#2 %
            \parseunicodedataIV{\count0}%
            \advance\count0 by 1 %
        \repeat
      \endgroup
    \fi
  \fi
}%
% Only take action if a line breaking class was previously saved: that will
% map to the correct class number.
% #1 is a code point in any form TeX accepts as a number (a hexadecimal
% constant or \count0 above). If the first pass stored a class name in
% |LB@<code point in decimal>|, the code point is given the class number
% held in \XeTeXcharclass<class name>.
\def\parseunicodedataIV#1{%
  \ifcsname LB@\number#1\endcsname
    \global\XeTeXcharclass#1=
      \csname XeTeXcharclass\csname LB@\number#1\endcsname\endcsname
  \fi
}%
% Second pass over the data: east Asian widths.
\readandparse{EastAsianWidth}%
% Close the group opened near the top of the file: only the \global
% \XeTeXcharclass assignments remain in force.
\endgroup
%