File: utf8-width.c

package info (click to toggle)
zsv 1.3.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 49,160 kB
  • sloc: ansic: 175,811; cpp: 56,301; sh: 3,623; makefile: 3,048; javascript: 577; cs: 90; awk: 70; python: 41; sql: 15
file content (60 lines) | stat: -rw-r--r-- 1,853 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#include <utf8proc.h>

/*
 * Report whether any of the first `len` bytes at `utf8` has its high bit set,
 * i.e. whether the buffer contains anything other than 7-bit ASCII.
 *
 * Returns 1 if such a byte is found, 0 otherwise (including when len is 0).
 *
 * Note: this could be sped up further with SIMD, but the payoff is unlikely
 * to be noticeable since it only runs for the cells shown on screen.
 */
static int has_multibyte_char(const char *utf8, size_t len) {
  static const uint64_t HIGH_BIT_MASK = 0x8080808080808080ULL;
  uint64_t word;

  // Test eight bytes per step; memcpy avoids alignment and aliasing issues
  for (; len >= sizeof(word); utf8 += sizeof(word), len -= sizeof(word)) {
    memcpy(&word, utf8, sizeof(word));
    if (word & HIGH_BIT_MASK)
      return 1;
  }

  if (len == 0)
    return 0;

  // Remaining 1..7 bytes: zero-pad the word so stale bytes cannot match
  word = 0;
  memcpy(&word, utf8, len);
  return (word & HIGH_BIT_MASK) ? 1 : 0;
}

/*
 * Tell whether the character starting at `utf8` is a single-byte newline.
 *
 * utf8      - pointer to the first byte of the character; must be readable
 * wchar_len - number of bytes the character occupies
 *
 * Returns 1 when the character is exactly one byte long and that byte is
 * '\n' or '\r'; returns 0 otherwise.
 */
static size_t is_newline(const unsigned char *utf8, int wchar_len) {
  // Compare explicitly instead of using strchr("\n\r", c): strchr also matches
  // the string's terminating NUL, which would report a NUL byte as a newline
  return (wchar_len == 1 && (utf8[0] == '\n' || utf8[0] == '\r'));
  // add multibyte newline check?
}

/*
 * Walk a UTF-8 buffer and find how many leading bytes fit within `max_width`
 * display columns, overwriting single-byte newlines ('\n', '\r') with spaces
 * in place along the way.
 *
 * str1       - buffer to scan; newline bytes in the scanned part are modified
 * len1       - number of bytes available at str1
 * max_width  - maximum number of display columns allowed
 * used_width - out: number of columns occupied by the returned bytes
 * err        - out: set to 1 if a byte sequence could not be decoded; it is
 *              never reset here, so the caller initializes it
 *
 * Returns the number of bytes of str1 whose combined display width does not
 * exceed max_width. An undecodable byte is counted as one byte occupying one
 * column.
 */
static size_t utf8_bytes_up_to_max_width_and_replace_newlines(unsigned char *str1, size_t len1, size_t max_width,
                                                              size_t *used_width, int *err) {
  utf8proc_int32_t codepoint1;
  utf8proc_ssize_t bytes_read1;
  size_t width_so_far = *used_width = 0;
  int this_char_width = 0;
  size_t bytes_so_far = 0;
  while (bytes_so_far < len1) {
    // Pass only the bytes that remain, so that a truncated sequence at the
    // end of the buffer cannot make the decoder read past str1 + len1
    bytes_read1 = utf8proc_iterate((utf8proc_uint8_t *)str1 + bytes_so_far,
                                   (utf8proc_ssize_t)(len1 - bytes_so_far), &codepoint1);
    if (bytes_read1 < 1) {
      // utf8proc_iterate signals invalid input with a negative error code
      // (0 only for an empty range); treat the offending byte as a single
      // one-column character so the scan always moves forward
      bytes_read1 = 1;
      *err = 1;
      this_char_width = 1;
    } else if (is_newline(str1 + bytes_so_far, bytes_read1)) {
      // Blank out the newline, then re-run the loop at the same offset so the
      // replacement space is measured like any other character
      memset((void *)(str1 + bytes_so_far), ' ', bytes_read1);
      continue;
    } else {
      this_char_width = utf8proc_charwidth(codepoint1);
    }

    // Stop before any character, valid or not, that would overflow the limit
    if (width_so_far + this_char_width > max_width)
      break;

    width_so_far += this_char_width;
    bytes_so_far += bytes_read1;
  }
  *used_width = width_so_far;
  return bytes_so_far;
}