File: utf.lua

package info (click to toggle)
rspamd 3.13.2-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 34,056 kB
sloc: ansic: 243,746; cpp: 105,657; javascript: 29,539; asm: 2,512; perl: 2,440; pascal: 1,625; python: 1,274; sql: 313; sh: 281; makefile: 140; xml: 74
file content (189 lines) | stat: -rw-r--r-- 5,728 bytes
parent folder | download | duplicates (2)
-- Test utf routines

context("UTF8 check functions", function()
  -- LuaJIT FFI handle used by every test below to call the C routines directly.
  local ffi = require("ffi")
  -- Hand-written prototypes for the C symbols resolved through ffi.C.
  -- ottery_rand_bytes is declared here but not called in the visible tests.
  -- NOTE(review): the integer widths below (unsigned int vs size_t) cannot be
  -- checked against the C headers from this file - confirm they still match.
  ffi.cdef [[
    unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
    unsigned int rspamd_str_lc (char *str, unsigned int size);
    void rspamd_fast_utf8_library_init (unsigned flags);
    void ottery_rand_bytes(void *buf, size_t n);
    double rspamd_get_ticks(int allow);
    size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len);
    size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len);
    char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *);
  ]]

  -- Each entry is { mixed-case input, expected lowercase result }.
  local cases = {
    { "АбЫрвАлг", "абырвалг" },
    { "АAБBвc", "аaбbвc" },
    --{"STRASSE", "straße"}, XXX: NYI
    { "KEÇİ", "keçi" },
  }

  for idx, pair in ipairs(cases) do
    local input, expected = pair[1], pair[2]
    local input_len = #input
    test("UTF lowercase " .. tostring(idx), function()
      -- The C routine gets a mutable copy of the input (one spare byte for
      -- the NUL terminator) and the test reads the result back from it.
      local scratch = ffi.new("char[?]", input_len + 1)
      ffi.copy(scratch, input)
      -- The returned length, not the input length, bounds the result string.
      local out_len = ffi.C.rspamd_str_lc_utf8(scratch, input_len)
      assert_equal(ffi.string(scratch, out_len), expected)
    end)
  end

  -- ASCII-only { input, expected lowercase result } pairs.
  cases = {
    { "AbCdEf", "abcdef" },
    { "A", "a" },
    { "AaAa", "aaaa" },
    { "AaAaAaAa", "aaaaaaaa" }
  }

  for idx, pair in ipairs(cases) do
    local input, expected = pair[1], pair[2]
    test("ASCII lowercase " .. tostring(idx), function()
      local scratch = ffi.new("char[?]", #input + 1)
      ffi.copy(scratch, input)
      -- The return value is ignored here; the result is read back up to the
      -- NUL terminator that ffi.copy placed after the input bytes.
      ffi.C.rspamd_str_lc(scratch, #input)
      assert_equal(ffi.string(scratch), expected)
    end)
  end

  -- Each entry is { raw input, expected sanitised output }; the invalid bytes
  -- \200\213\202 are expected to come back as '�' replacement characters.
  cases = {
    { 'тест', 'тест' },
    { '\200\213\202', '���' },
    { 'тест\200\213\202test', 'тест���test' },
    { '\200\213\202test', '���test' },
    { '\200\213\202test\200\213\202', '���test���' },
    { 'тест\200\213\202test\200\213\202', 'тест���test���' },
    { 'тест\200\213\202test\200\213\202тест', 'тест���test���тест' },
  }

  -- Null pointer passed for both the output-length and the last argument.
  local NULL = ffi.new 'void*'

  -- Renders every byte of a string as two uppercase hex digits.
  local function to_hex(str)
    return (str:gsub('.', function(ch)
      return string.format('%02X', string.byte(ch))
    end))
  end

  for idx, pair in ipairs(cases) do
    test("Unicode make valid " .. tostring(idx), function()
      local src, expected = pair[1], pair[2]
      local raw = ffi.new("char[?]", #src + 1)
      ffi.copy(raw, src)

      -- NOTE(review): the pointer returned by the C call is not released
      -- anywhere in this test - confirm who owns it.
      local fixed = ffi.string(ffi.C.rspamd_str_make_utf_valid(raw, #src, NULL, NULL))
      -- Hex dumps of the actual and expected bytes, printed for diagnostics.
      print(to_hex(fixed))
      print(to_hex(expected))
      assert_equal(fixed, expected)
    end)
  end

  -- Enable sse and avx2 (flags value 3) before any validator call
  ffi.C.rspamd_fast_utf8_library_init(3)

  -- Well-formed samples: 1-, 2-, 3- and 4-byte sequences plus a mixed string.
  local valid_cases = {
    "a",
    "\xc3\xb1",
    "\xe2\x82\xa1",
    "\xf0\x90\x8c\xbc",
    "안녕하세요, 세상"
  }

  for idx = 1, #valid_cases do
    local sample = valid_cases[idx]
    test("Unicode validate success: " .. tostring(idx), function()
      local raw = ffi.new("char[?]", #sample + 1)
      ffi.copy(raw, sample)

      -- These tests treat a zero return as "input is valid".
      assert_equal(ffi.C.rspamd_fast_utf8_validate(raw, #sample), 0)
    end)
  end
  -- Malformed samples; the validator must reject every one of them.
  local invalid_cases = {
    -- lead byte followed by a non-continuation byte, lone continuation bytes
    "\xc3\x28",
    "\xa0\xa1",
    "\xe2\x28\xa1",
    "\xe2\x82\x28",
    "\xf0\x28\x8c\xbc",
    "\xf0\x90\x28\xbc",
    "\xf0\x28\x8c\x28",
    -- overlong lead byte, lead byte above U+10FFFF, surrogate, 5-byte form
    "\xc0\x9f",
    "\xf5\xff\xff\xff",
    "\xed\xa0\x81",
    "\xf8\x90\x80\x80\x80",
    -- 15 ASCII bytes followed by a lead byte with no continuation
    "123456789012345\xed",
    "123456789012345\xf1",
    "123456789012345\xc2",
    "\xC2\x7F"
  }

  for idx = 1, #invalid_cases do
    local sample = invalid_cases[idx]
    test("Unicode validate fail: " .. tostring(idx), function()
      local raw = ffi.new("char[?]", #sample + 1)
      ffi.copy(raw, sample)

      -- Any non-zero return counts as a rejection.
      assert_not_equal(ffi.C.rspamd_fast_utf8_validate(raw, #sample), 0)
    end)
  end

  if os.getenv("RSPAMD_LUA_EXPENSIVE_TESTS") then
    local speed_iters = 10000
    --- Times one UTF-8 validator over a synthetic buffer and logs the result.
    -- The buffer is filled with whole repetitions of the sample strings
    -- (valid_cases, plus invalid_cases when is_valid is false). Any tail that
    -- is not filled keeps the zero bytes ffi.new initialised it with.
    -- @param buflen   size in bytes of the buffer handed to the validator
    -- @param is_valid true to build the buffer from the valid samples only
    -- @param impl     'ref' times rspamd_fast_utf8_validate_ref, 'opt' times
    --                 rspamd_fast_utf8_validate; other values time nothing
    -- @return always 0; the validator results themselves are not inspected
    local function test_size(buflen, is_valid, impl)
      local logger = require "rspamd_logger"
      local test_str
      if is_valid then
        test_str = table.concat(valid_cases)
      else
        test_str = table.concat(valid_cases) .. table.concat(invalid_cases)
      end

      local buf = ffi.new("char[?]", buflen)
      if #test_str < buflen then
        -- Repeat the sample in whole copies while it still fits the buffer.
        local t = {}
        local len = #test_str
        while len < buflen do
          t[#t + 1] = test_str
          len = len + #test_str
        end
        test_str = table.concat(t)
      end
      -- Copy with an explicit byte count. The two-argument ffi.copy(dst, str)
      -- also writes a trailing NUL (#str + 1 bytes), which overran buf by one
      -- byte whenever the sample was at least buflen bytes long.
      ffi.copy(buf, test_str, math.min(#test_str, buflen))

      local tm = 0

      for _ = 1, speed_iters do
        if impl == 'ref' then
          local t1 = ffi.C.rspamd_get_ticks(1)
          ffi.C.rspamd_fast_utf8_validate_ref(buf, buflen)
          local t2 = ffi.C.rspamd_get_ticks(1)
          tm = tm + (t2 - t1)
        elseif impl == 'opt' then
          local t1 = ffi.C.rspamd_get_ticks(1)
          ffi.C.rspamd_fast_utf8_validate(buf, buflen)
          local t2 = ffi.C.rspamd_get_ticks(1)
          tm = tm + (t2 - t1)
        end
      end

      -- Report the average cost per call and per buffer byte.
      logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte",
          impl, buflen, is_valid,
          tm / speed_iters, tm / speed_iters / buflen)

      return 0
    end

    -- Register one benchmark per (buffer size, implementation, validity)
    -- combination. Order per size: ref/valid, ref/invalid, opt/valid,
    -- opt/invalid.
    local impls = { 'ref', 'opt' }
    local modes = {
      { label = 'valid', flag = true },
      { label = 'invalid', flag = false },
    }
    for _, sz in ipairs({ 78, 512, 65535 }) do
      for _, impl in ipairs(impls) do
        for _, mode in ipairs(modes) do
          test(string.format("Utf8 test %s %d buffer, %s", impl, sz, mode.label), function()
            assert_equal(test_size(sz, mode.flag, impl), 0)
          end)
        end
      end
    end
  end

end)