1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
|
local decoder = require "luacheck.decoder"
local lua_utf8 = require "lua-utf8"
-- Decodes the given values with `decoder.decode` and compares the result
-- against a reference string library.
--
-- `encoding` selects the reference: "utf8" uses `lua_utf8`, anything else
-- uses the standard `string` library. The remaining arguments are passed to
-- the reference library's `char` to build the input string.
--
-- Checked, in order: `get_length`, `get_substring` for every `from <= to`
-- range, then `get_codepoint` and `find("(.)", index)` for each character.
-- Every assertion message ends with a "(\u{..}...)" rendering of the
-- arguments so a failure identifies the offending input.
local function assert_encoding(encoding, ...)
   local reference = string

   if encoding == "utf8" and lua_utf8 then
      reference = lua_utf8
   end

   local count = select("#", ...)
   local values = {...}
   local encoded = reference.char(...)
   local decoded = decoder.decode(encoded)

   -- Render the arguments once; the result is appended to every message.
   local pieces = {}

   for i = 1, count do
      pieces[i] = string.format("\\u{%X}", values[i])
   end

   local suffix = "(" .. table.concat(pieces) .. ")"
   local length_message = ":get_length" .. suffix
   local substring_message = ":get_substring" .. suffix
   local codepoint_message = ":get_codepoint" .. suffix
   local find_message = ":find" .. suffix

   assert.equals(count, decoded:get_length(), length_message)

   for first = 1, count do
      for last = first, count do
         local expected = reference.sub(encoded, first, last)
         local actual = decoded:get_substring(first, last)
         assert.equals(expected, actual, substring_message)
      end
   end

   -- Pick an iterator yielding (offset, codepoint) pairs: `lua_utf8.next` over
   -- the encoded string for utf8, `ipairs` over the raw values otherwise.
   local step, subject, control

   if encoding == "utf8" then
      step, subject = lua_utf8.next, encoded
   else
      step, subject, control = ipairs(values)
   end

   local position = 0

   for offset, expected_codepoint in step, subject, control do
      position = position + 1
      assert.equals(expected_codepoint, decoded:get_codepoint(position), codepoint_message)

      -- "(.)" is expected to match exactly one byte located at `offset`.
      local match_start, match_end, captured = decoded:find("(.)", position)
      assert.equals(offset, match_start, find_message)
      assert.equals(offset, match_end, find_message)
      assert.equals(string.sub(encoded, offset, offset), captured, find_message)
   end
end
-- Specs for `luacheck.decoder`, driven through `assert_encoding`.
describe("decoder", function()
   it("decodes valid codepoints correctly", function()
      -- Exhaustively testing every codepoint is too slow when coverage is
      -- enabled, so only sample a handful of values per 0x800-wide block.
      for block_start = 0, 0x10FFFF, 0x800 do
         for delta = 0, 0x100, 41 do
            local first = block_start + delta
            assert_encoding("utf8", first, first + 9)
         end
      end
   end)

   it("falls back to latin1 on invalid utf8", function()
      -- Every sequence below is checked against the plain `string` library.
      local function check_latin1(...)
         assert_encoding("latin1", ...)
      end

      -- Bad first byte.
      check_latin1(0xC0, 0x80, 0x80, 0x80)
      check_latin1(0x00, 0xF8, 0x80, 0x80, 0x80)

      -- Two bytes, bad continuation byte.
      check_latin1(0x00, 0xC0, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xC0, 0xFF, 0xC0, 0x80)

      -- Three bytes, bad first continuation byte.
      check_latin1(0x00, 0xE0, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xE0, 0xFF, 0xC0, 0x80)

      -- Three bytes, bad second continuation byte.
      check_latin1(0x00, 0xE0, 0x80, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xE0, 0x80, 0xFF, 0xC0, 0x80)

      -- Four bytes, bad first continuation byte.
      check_latin1(0x00, 0xF0, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xF0, 0xFF, 0xC0, 0x80)

      -- Four bytes, bad second continuation byte.
      check_latin1(0x00, 0xF0, 0x80, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xF0, 0x80, 0xFF, 0xC0, 0x80)

      -- Four bytes, bad third continuation byte.
      check_latin1(0x00, 0xF0, 0x80, 0x80, 0x00, 0xC0, 0x80)
      check_latin1(0x00, 0xF0, 0x80, 0x80, 0xFF, 0xC0, 0x80)

      -- Codepoint too large.
      check_latin1(0xF7, 0x80, 0x80, 0x80, 0x00)
   end)
end)
|