1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353
|
-- object for Unicode string collation
local math = require "math"
local tailoring_lib = require "lua-uca.lua-uca-tailoring"
local reordering_table = require "lua-uca.lua-uca-reordering-table"
local uni_normalize
-- in LuaTeX, load the lua-uni-normalize library
if kpse then
uni_normalize = require "lua-uni-normalize"
end
local collator = {}
collator.__index = collator
local function copy_table(tbl)
local t = {}
for k, v in pairs(tbl) do
if type(v) == "table" then
t[k] = copy_table(v)
else
t[k] = v
end
end
return t
end
collator.copy_table = copy_table
function collator.new(codes)
local self = setmetatable({}, collator)
-- tree with mappings from codepoints to collation elements
self.codes = copy_table(codes or {})
-- cached sort keys
self.stringcache = {}
self.tailoring_multiplier = {1, 1, 1, 1}
return self
end
-- make tree that points weights back to codepoints
function collator:weight_to_codepoints(codes)
local weights = {}
local codes = codes or self.codes
local function insert_weight(element, codepoints, weights)
local values = element.value
local weights = weights
if values and not values.equal then
values = values[1]
local current = weights
for i, weight in ipairs(values) do
local new = current[weight] or {}
-- multiple characters can have the same weight.
if i == #values and not new.codepoints then
new.codepoints = codepoints
end
current[weight] = new
current = new
end
end
local children = element.children or {}
for codepoint, element in pairs(children) do
-- make copy of the codes table, so the new codepoints are not added indefinitely
local newcodes = {table.unpack(codepoints)}
newcodes[#newcodes + 1] = codepoint
insert_weight(element, newcodes, weights)
end
end
for codepoint, element in pairs(codes) do
insert_weight(element, {codepoint}, weights)
end
return weights
end
function collator:get_implicit_weight(codepoints, pos)
-- implicit weight is based on the codepoint value
local codepoint = codepoints[pos]
return {{codepoint, 0, 0}}, pos + 1
end
function collator:read_weight(codepoints, pos)
-- try to find contractions and return weight for longest matched string
-- in the database
local function read_children(parent, pos)
local children = parent.children or {}
local newpos = pos + 1
newcodepoint = codepoints[newpos]
-- if we go out of the codepoint array
if not newcodepoint then return nil end
local child = children[newcodepoint]
if child then
local nextchild, nextpos = read_children(child, newpos)
if nextchild then return nextchild, nextpos end
if child.value then return child.value, newpos end
end
return nil
end
local weights
local current_codepoint = codepoints[pos]
local codes = self.codes[current_codepoint]
if not codes then return nil, pos + 1 end
-- first try to read contractions
weights, new_pos = read_children(codes, pos)
if weights then return weights, new_pos + 1 end
-- if no contraction, weights are in the value field
return codes.value, pos + 1
end
-- get weights for the next characters
function collator:get_weights(codepoints, pos)
local weights, next_pos = self:read_weight(codepoints, pos)
-- return implicit weights for codepoints that are not in the database
if not weights then
weights, next_pos = self:get_implicit_weight(codepoints, pos)
end
-- don't step next_pos if it is larger than size of the codepoints array
if next_pos > #codepoints then next_pos = nil end
return weights, next_pos
end
-- get characters with lowest weight at the given point of the codepoint array
-- useful for index header strings
function collator:get_lowest_char(codepoints, pos)
-- find the lowest key in the table
local min = math.min
local max = math.max
local minimal = 0xffffffffff
local maximal = -0xfffffffff
local function get_lowest_key(tbl)
local minimal, maximal = minimal, maximal
if tbl.minimal then return tbl.minimal, tbl.maximal end
for k,v in pairs(tbl) do
minimal,maximal = min(k, minimal), max(k, maximal)
end
tbl.minimal, tbl.maximal = minimal, maximal
return minimal, maximal
end
-- traverse tree, select lowest keys and find the codepoints
local function find_codepoints(tbl, level)
local level = level or 1
local tbl = tbl or {}
if tbl.codepoints then return tbl.codepoints end
local newmin, newmax = get_lowest_key(tbl)
-- this shouldn't happen
if newmin == minimal then return nil end
-- if the collation is set to upperace first, then the lowest character has the hightest
-- collation value
if self.is_uppercase_first and level == 2 then
newmin = newmax
end
return find_codepoints(tbl[newmin], level + 1)
end
local weights, next_pos = self:read_weight(codepoints, pos)
local x
if weights then
local weight_to_char = self.weight_to_char or self:weight_to_codepoints()
self.weight_to_char = weight_to_char
-- find the primary weight of the matched character
local first = weights[1][1]
local current = weight_to_char[first]
-- traverse the tree to find the codepoint with the lowest weight
x = find_codepoints(current)
end
return x, next_pos
end
function collator:update_levels(levels, weights)
-- process weight weights
for _, w in ipairs(weights) do
for i, x in ipairs(w) do
-- process collation elements
if x ~= 0 then -- ignore zero elements
-- insert element at the current collation level
local current_level = levels[i] or {}
table.insert(current_level, x)
levels[i] = current_level
end
end
end
return levels
end
-- make sort key from codepoints array (it can be created using the utf8.codes() function
function collator:make_sort_key(codepoints)
local levels = {}
local pos = 1
local weights
local sort_key = {}
while true do
--
weights, pos = self:get_weights(codepoints, pos)
levels = self:update_levels(levels, weights)
-- break when we reach end of the codepoints array
if not pos then break end
end
for i, elements in ipairs(levels) do
for _, element in ipairs(elements) do
table.insert(sort_key, element)
end
-- zero separates levels in the sort key
table.insert(sort_key, 0)
end
return sort_key
end
function collator:compare(a, b)
-- sort using sort keys
local min = math.min(#a, #b)
for i = 1, min do
if a[i] ~= b[i] then return a[i] < b[i] end
end
-- this should happen only when the strings are equal
-- it needs to return false, otherwise the table.sort function reports
-- "invalid order function for sorting" error
return #a < #b
end
local codepoints_cache = {}
function collator:string_to_codepoints(a)
if codepoints_cache[a] then return codepoints_cache[a] end
-- try unicode normalization, if it is available
-- it uses libraries available in LuaTeX, so it doesn't work with
-- stock Lua
local normalized = a
if uni_normalize then
normalized = uni_normalize.NFC(a)
end
local codepoints = {}
for _, code in utf8.codes(normalized) do codepoints[#codepoints+1] = code end
codepoints_cache[a] = codepoints
return codepoints
end
function collator:compare_strings(a,b)
-- sort using strings
local cache = self.stringcache
local get_sortkey = function(x) return self:make_sort_key(self:string_to_codepoints(x)) end
local asortkey = cache[a] or get_sortkey(a)
local bsortkey = cache[b] or get_sortkey(b)
cache[a], cache[b] = asortkey, bsortkey
return self:compare(asortkey, bsortkey)
end
-- update collation codes
function collator:update_codes(key, elements)
local keys = self.codes
local function add_to_tree(tbl, current_pos)
local tbl = tbl or {}
local current_key = key[current_pos]
local el = tbl[current_key] or {}
if current_pos < #key then
el.children = add_to_tree(el.children, current_pos + 1)
elseif current_pos == #key then
el.value = elements
end
tbl[current_key] = el
return tbl
end
keys = add_to_tree(keys, 1)
end
--- change sorting ordering
--
function collator:tailor(base, target, tailoring_table)
-- get the value of the base character
local value = self:get_weights(base, 1)
local new_value = {}
-- create a new collation element
for k, v in ipairs(value) do
local subtable = {}
for x, y in ipairs(v) do
subtable[x] = y + ((tailoring_table[x] or 0) * self.tailoring_multiplier[x] or 1)
end
new_value[k] = subtable
end
-- when tailoring sets an equivialent character, it needs to be ignored in collator:weight_to_codepoints
local is_equivalent = 0
for _, x in ipairs(tailoring_table) do is_equivalent = is_equivalent + x end
if is_equivalent == 0 then new_value.equal = true end
self:update_codes(target, new_value)
end
-- support for CLDR style strings
function collator:tailor_string(s)
tailoring_lib.tailor_string(self, s)
end
-- reorder scripts
-- pass table with script names to reorder
function collator:reorder(tbl)
-- make table of the reordering table
local t = copy_table(reordering_table)
for _, script in ipairs(tbl) do
-- reorder scripts
tailoring_lib.reorder(script, t)
end
-- apply reordering to the collator object
tailoring_lib.reorder_collator(self, t)
end
-- expand characters to another characters
function collator:equal(base, target)
local new_weight = {}
local values, pos
pos = 1
while true do
value, pos = self:get_weights(target, pos)
for _, v in ipairs(value) do
new_weight[#new_weight + 1] = v
end
if not pos then break end
end
self:update_codes(base, new_weight)
end
-- sort uppercase first
function collator:uppercase_first()
-- table with values for uppercase
local uppercase_values = {0x08, 0x09, 0xA, 0xB, 0x0C, 0x0E, 0x11, 0x12, 0x1D}
local is_uppercase = {}
-- we must invert tertiary element in tailoring
self.tailoring_multiplier[3] = -1
self.is_uppercase_first = true
for k,v in ipairs(uppercase_values) do is_uppercase[v] = true end
local function change_case(element)
local value = element.value or {}
for _, collation_element in ipairs(value) do
-- detect if the tertiary element is uppercase
local third = collation_element[3]
-- make new collation element and insert it before the tertiary element
local case = is_uppercase[third] and 1 or 3
table.insert(collation_element, 3, case)
end
-- recursivelly process children
local children = element.children or {}
for x, child in pairs(children) do
change_case(child)
end
end
for _, element in pairs(self.codes) do
change_case(element)
end
end
return collator
|