1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
#!/usr/bin/env texlua
--------------------------------------------------------------------------------
-- FILE: transliterator.lua
-- USAGE: to be called by t-transliterator.mkiv
-- DESCRIPTION: basic lua environment for the Transliterator module
-- REQUIREMENTS: latest ConTeXt MkIV
-- AUTHOR: Philipp Gesang (Phg), <phg42.2a@gmail.com>
-- CREATED: 2010-12-23 22:12:31+0100
--------------------------------------------------------------------------------
--
-- Module namespace. Everything lives in thirddata.translit; an already
-- existing namespace table is reused instead of being replaced.
thirddata = thirddata or { }
local translit = thirddata.translit or { }
thirddata.translit = translit

-- The registries survive a repeated load of this file ...
for _, registry in ipairs { "tables", "methods" } do
    translit[registry] = translit[registry] or { }
end

-- ... whereas these two fields are reset unconditionally on every load.
translit.deficient_font = "no"
translit.parser_cache   = { }

-- String helpers taken from the global `unicode` table; they are used below
-- with character (not byte) positions.
-- NOTE(review): presumably provided by the LuaTeX/ConTeXt runtime -- confirm
-- it is still available in the targeted ConTeXt version.
local utf8 = unicode.utf8
local utf8byte, utf8len = utf8.byte, utf8.len
--------------------------------------------------------------------------------
-- Predefining vowel lists
--------------------------------------------------------------------------------
-- If you haven't heard of cyrillic scripts until now you might want to read
-- at least the first 15 pages of
-- http://www.uni-giessen.de/partosch/eurotex99/berdnikov2.pdf
-- before you continue reading this file.
-- Russian vowel letters: the first row holds the lowercase forms, the second
-- row the uppercase forms in the same order. Note that “й” / “Й” is listed
-- here and not among the consonants.
translit.ru_vowels = {"а", "е", "ё", "и", "й", "о", "у", "ы", "э", "ю", "я",
"А", "Е", "Ё", "И", "Й", "О", "У", "Ы", "Э", "Ю", "Я"}
-- Russian consonant letters: two rows of lowercase forms followed by two rows
-- of uppercase forms in the same order. The signs “ъ” and “ь” appear in
-- neither list.
translit.ru_consonants = {"б", "в", "г", "д", "ж", "з", "к", "л", "м", "н",
"п", "р", "с", "т", "ф", "х", "ц", "ч", "ш", "щ",
"Б", "В", "Г", "Д", "Ж", "З", "К", "Л", "М", "Н",
"П", "Р", "С", "Т", "Ф", "Х", "Ц", "Ч", "Ш", "Щ"}
-- Substitution tables are the very heart of the Transliterator. Due to the
-- nature of languages and scripts exhaustive substitution is the simplest
-- method for transliterations and transcriptions unless they are one-to-one
-- mappings like those defined in ISO~9.
--
-- To achieve better reusability we split the tables into segments, the most
-- obvious being the \type{*_low} and \type{*_upp} variants for sets of lowercase
-- and uppercase characters. Another set is constituted by e.~g. the
-- \type{ru_old*} tables that allow adding transcription of historical
-- characters if needed; by the way those are included in the default
-- transliteration mode \type{ru_old}.
-- Tables can be found in separate Lua files.
-- See {\tt
-- trans_tables_glag.lua
-- trans_tables_gr.lua
-- trans_tables_iso9.lua
-- trans_tables_scntfc.lua
-- and
-- trans_tables_trsc.lua.}
--------------------------------------------------------------------------------
-- Metatables allow for lazy concatenation.
--------------------------------------------------------------------------------
do
    -- Metatable shared by all “addable” dictionaries.
    --
    -- The table is created *before* the metamethod is attached: a local
    -- variable only becomes visible after its declaring statement, so a
    -- function written inside the initializer of `local Dict_add = { ... }`
    -- would see a (nil) global named Dict_add instead of this table, and the
    -- result of an addition would silently lose its metatable.
    local Dict_add = { }

    --- Union of both key sets for the “+” operator.
    -- The values of the first table are updated (read: overridden) by those
    -- given in the second. Neither operand is modified.
    -- @param dict_a table  left operand
    -- @param dict_b table  right operand
    -- @return a new table carrying the same metatable, so that sums can be
    --         added again regardless of how the expression is grouped
    Dict_add.__add = function (dict_a, dict_b)
        assert (type(dict_a) == "table" and type(dict_b) == "table",
                "translit: both operands of “+” must be tables")
        local dict_result = setmetatable({}, Dict_add)
        for key, val in pairs(dict_a) do
            dict_result[key] = val
        end
        for key, val in pairs(dict_b) do
            dict_result[key] = val
        end
        return dict_result
    end

    --- Make a plain table usable with the “+” operator.
    -- @param dict table  receives the metatable in place
    -- @return the very same table
    translit.make_add_dict = function (dict)
        return setmetatable(dict, Dict_add)
    end
end
--------------------------------------------------------------------------------
-- Auxiliary Functions
--------------------------------------------------------------------------------
-- Generate a rule pattern from hash table.
do
    local P, R = lpeg.P, lpeg.R

    --- Append one literal alternative per key of `dict` to an LPeg choice.
    -- Keys are grouped by their length in characters and added longest
    -- first, so that in the resulting ordered choice a multi-character rule
    -- is tried before any shorter rule.
    -- @param dict  table whose keys are the strings to match (values unused)
    -- @param rules existing pattern to extend, or nil to start a new one
    -- @return the extended pattern; `rules` unchanged if `dict` is empty
    translit.addrules = function (dict, rules)
        local buckets, lengths = { }, { }
        for chr in next, dict do
            local len    = utf8len(chr)
            local bucket = buckets[len]
            if bucket == nil then
                bucket = { }
                buckets[len] = bucket
                lengths[#lengths+1] = len
            end
            bucket[#bucket+1] = chr
        end

        -- longest keys first
        table.sort(lengths, function (a, b) return a > b end)

        for _, len in ipairs(lengths) do
            for _, chr in ipairs(buckets[len]) do
                local literal = P(chr)
                if rules then
                    rules = rules + literal
                else
                    rules = literal
                end
            end
        end
        return rules
    end

    -- Modified version of Hans’s utf pattern (l-lpeg.lua).
    -- Matches exactly one UTF-8 encoded character: a single byte 0–127, or a
    -- lead byte (194–223, 224–239, 240–244) followed by one, two or three
    -- continuation bytes (128–191) respectively.
    local continuation = R("\128\191")
    translit.utfchar = R("\000\127")
                     + R("\194\223") * continuation
                     + R("\224\239") * continuation * continuation
                     + R("\240\244") * continuation * continuation * continuation
end
-- We might want to have all the table data nicely formatted by \CONTEXT\
-- itself; here is how we do it. \type{translit.show_tab(t)} handles a
-- single table \type{t}, builds a Natural TABLE out of its content and
-- hands it down to the machine for typesetting. For debugging purposes it
-- prints not only the replacement pairs but also their lengths and code
-- points.
-- Render a table key or replacement value for the debugging columns:
-- the empty string becomes the literal text "nil", anything else becomes the
-- list of values returned by utf8byte for each position of the string (there
-- can be more than one with composed diacritics), each followed by a space.
local strempty = function (s)
    if s == "" then
        return "nil"
    end
    local parts = { }
    local pos   = 1
    repeat
        parts[#parts+1] = utf8byte(s, pos) .. " "
        pos = pos + 1
    until utf8byte(s, pos) == nil
    return table.concat(parts)
end
function translit.show_tab (tab)
    -- Typeset one transliteration table as a Natural TABLE.
    -- Every key/value pair of `tab` becomes a row with seven columns:
    -- running number, key, string.len of the key, value, string.len of the
    -- value, and the strempty() rendering of key and value.
    -- Lots of calls to context() but as it’s only a goodie this doesn’t
    -- really matter.
    local labels = {
        "number", "letters", "n", "replacement", "n", "bytes", "repl. bytes",
    }
    local setups = {
        { "r", "each",  { style="\\tfx", align="center" } },
        { "c", "each",  { frame="off" } },
        { "r", "each",  { frame="off" } },
        { "c", "first", { style="italic" } },
        { "r", "first", { style="bold", topframe="on", bottomframe="on" } },
        { "r", "last",  { style="bold", topframe="on", bottomframe="on" } },
    }

    -- one ordinary table cell
    local function cell (content)
        context.bTC() context(content) context.eTC()
    end

    for _, setup in ipairs(setups) do
        context.setupTABLE({ setup[1] }, { setup[2] }, setup[3])
    end

    context.bTABLE({split="yes", option="stretch"})

    -- head: column labels as header cells
    context.bTABLEhead()
    context.bTR()
    for _, label in ipairs(labels) do
        context.bTH() context(label) context.eTH()
    end
    context.eTR()
    context.eTABLEhead()

    -- body: one row per entry, in the (unspecified) order next() yields them
    context.bTABLEbody()
    local row = 0
    for key, val in next, tab do
        row = row + 1
        context.bTR()
        cell(row)
        cell(key)
        cell(string.len(key))
        cell(val)
        cell(string.len(val))
        cell(strempty(key))
        cell(strempty(val))
        context.eTR()
    end
    context.eTABLEbody()

    -- foot: the labels once more, this time as ordinary cells
    context.bTABLEfoot()
    context.bTR()
    for _, label in ipairs(labels) do
        cell(label)
    end
    context.eTR()
    context.eTABLEfoot()

    context.eTABLE()
end
-- Having to pick out single tables for printing can be tedious, therefore we
-- let Lua do the job in our stead. \type{translit.show_all_tabs()} calls
-- \type{translit.show_tab} on every table that is registered in
-- \type{translit.tables} -- and uses its registered key as section heading.
function translit.show_all_tabs ()
    -- Each table file is loaded exactly once.
    -- NOTE(review): trans_tables_bg, which translit.transliterate loads for
    -- the bg_* methods, is not part of this list -- confirm whether that is
    -- intentional.
    local table_files = {
        "trans_tables_iso9",
        "trans_tables_trsc",
        "trans_tables_scntfc",
        "trans_tables_sr",
        "trans_tables_glag",
        "trans_tables_gr",
    }
    for _, file in ipairs(table_files) do
        environment.loadluafile (file)
    end
    translit.gen_rules_en()
    translit.gen_rules_de()

    -- Output all translation tables that are registered within
    -- translit.tables. pairs() yields them in no particular order, so the
    -- keys are collected and sorted first; that way the sections come out in
    -- the same order on every run.
    local names = { }
    for name in pairs(translit.tables) do
        names[#names+1] = name
    end
    table.sort(names, function (a, b) return tostring(a) < tostring(b) end)

    context.chapter("Transliterator Showing All Tables")
    for _, name in ipairs(names) do
        context.section(name)
        translit.show_tab (translit.tables[name])
    end
end
-- For internal use only: a running counter of debug messages.
translit.debug_count = 0

-- Increment the counter and typeset a small marker carrying the new number.
function translit.debug_next ()
    local nr = translit.debug_count + 1
    translit.debug_count = nr
    context("\\tfxx{\\bf translit debug msg. nr.~" .. nr .. "}")
end
--------------------------------------------------------------------------------
-- User-level Function
--------------------------------------------------------------------------------
-- \type{translit.transliterate(method, text)} constitutes the
-- metafunction that is called by the \type{\transliterate} command.
-- It loads the transliteration tables according to \type{method} and calls
-- the corresponding function.

-- Those supposedly are the most frequently used so it won’t hurt to preload
-- them. The rest will be loaded on request.
environment.loadluafile ("trans_tables_iso9")

-- Table file to load for a method that is referred to by its exact name.
local tables_by_method = {
    ru_transcript_de     = "trans_tables_trsc",
    ru_transcript_de_exp = "trans_tables_trsc", -- experimental lpeg
    ru_transcript_en     = "trans_tables_trsc",
    ru_transcript_en_exp = "trans_tables_trsc",
    ru_cz                = "trans_tables_trsc",
    ocs_cz               = "trans_tables_trsc",
    iso9_ocs             = "trans_tables_scntfc",
    iso9_ocs_hack        = "trans_tables_scntfc",
    ocs                  = "trans_tables_scntfc",
    ocs_gla              = "trans_tables_scntfc",
    gr                   = "trans_tables_gr",
    gr_n                 = "trans_tables_gr",
}

-- Table file to load for a whole family of methods sharing a name prefix.
local tables_by_prefix = {
    { "^sr_", "trans_tables_sr" },
    { "^bg_", "trans_tables_bg" }, -- only bg_de for now
}

-- Return the name of the table file that is expected to register `method`,
-- or nil if no file is associated with it.
local function tables_for (method)
    local file = tables_by_method[method]
    if file == nil and type(method) == "string" then
        for _, entry in ipairs(tables_by_prefix) do
            if method:match(entry[1]) then
                return entry[2]
            end
        end
    end
    return file
end

--- Transliterate `text` with the given method.
-- If the method is not registered in translit.methods yet, the table file
-- associated with it is loaded first.
-- @param method name of the transliteration method (a key of
--               translit.methods)
-- @param text   the string handed to the method function
-- @return the converted text if translit.__script is set; otherwise nothing,
--         the result is passed to context() instead
-- Raises an error naming the method if it is still unregistered after the
-- load attempt (instead of failing with “attempt to call a nil value”).
function translit.transliterate (method, text)
    local methods = translit.methods
    if not methods[method] then -- register tables and method
        local file = tables_for(method)
        if file then
            environment.loadluafile (file)
        end
    end

    local convert = methods[method]
    if not convert then
        error(string.format(
            "transliterator: unknown transliteration method %q",
            tostring(method)), 2)
    end

    if translit.__script then
        return convert(text)
    end
    context ( convert(text) )
end
-- vim:sw=4:ts=4:expandtab:ft=lua
|