/* getcwidth - Get the OS's idea of the width of Unicode codepoints
 *
 * This code is Copyright (c) 2013, by the authors of nmh.  See the
 * COPYRIGHT file in the root directory of the nmh distribution for
 * complete copyright information.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef MULTIBYTE_SUPPORT
#include <inttypes.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#endif
#ifdef MULTIBYTE_SUPPORT
/* Forward declarations of the helpers defined after main(). */
/* Print the command-line synopsis to stderr and exit with status 1. */
static void usage(char *);
/* Print runs of consecutive codepoints that share a wcwidth() value. */
static void dumpwidth(void);
/* Print one line per codepoint with its width and wide-character classes. */
static void dumpctype(void);
/* Print the summed column width of one multibyte string. */
static void getwidth(const char *);
#endif /* MULTIBYTE_SUPPORT */
/*
 * Program entry point.
 *
 * Without MULTIBYTE_SUPPORT the program only reports that fact and exits
 * with status 1.  Otherwise:
 *
 *   --dump     (sole argument) print the width table and exit 0
 *   --ctype    (sole argument) print the classification table and exit 0
 *   U+XXXX     print wcwidth() of the given hexadecimal codepoint
 *   other      print the summed column width of the multibyte string
 *
 * Invalid U+ arguments are reported on stderr and skipped; the exit status
 * is still 0 in that case.
 *
 * NOTE: the #else opened below is closed by the #endif at the end of
 * dumpctype(), so everything between here and there is compiled only when
 * MULTIBYTE_SUPPORT is defined.
 */
int
main(int argc, char *argv[])
{
#ifndef MULTIBYTE_SUPPORT
    (void) argc;
    (void) argv;

    fprintf(stderr, "Nmh was not configured with multibyte support\n");
    exit(1);
#else /* MULTIBYTE_SUPPORT */
    int i;

    /* A locale failure is reported but is not fatal. */
    if (! setlocale(LC_ALL, "")) {
        fprintf(stderr, "setlocale failed, check your LC_ALL, "
                "LC_CTYPE, and LANG environment variables\n");
    }

    if (argc < 2)
        usage(argv[0]);

    if (strcmp(argv[1], "--dump") == 0) {
        if (argc != 2) {
            fprintf(stderr, "--dump cannot be combined with "
                    "other arguments\n");
            exit(1);
        }
        dumpwidth();
        exit(0);
    }

    if (strcmp(argv[1], "--ctype") == 0) {
        if (argc != 2) {
            fprintf(stderr, "--ctype cannot be combined with other arguments\n");
            exit(1);
        }
        dumpctype();
        exit(0);
    }

    /*
     * Process each argument.  If it begins with "U+", then try to
     * convert it to a Unicode codepoint.  Otherwise, take each
     * string and get the total width.
     */
    for (i = 1; i < argc; i++) {
        if (strncmp(argv[i], "U+", 2) == 0) {
            /*
             * We're making a big assumption here that
             * wchar_t represents a Unicode codepoint.
             * That technically isn't valid unless the
             * C compiler defines __STDC_ISO_10646__, but
             * we're going to assume now that it works.
             */
            const char *digits = argv[i] + 2;
            size_t ndigits = strspn(digits, "0123456789abcdefABCDEF");
            unsigned long cp = 0;
            int valid;

            /*
             * strtoul() alone is too lenient: it returns 0 for
             * an empty or non-numeric string without setting
             * errno, and it accepts signs, leading whitespace
             * and trailing junk.  Require one or more hex
             * digits and nothing else.
             */
            valid = ndigits > 0 && digits[ndigits] == '\0';
            if (valid) {
                errno = 0;
                cp = strtoul(digits, NULL, 16);
                /* Reject overflow and values wchar_t cannot hold. */
                valid = errno == 0 &&
                        cp <= (unsigned long) WCHAR_MAX;
            }
            if (! valid) {
                fprintf(stderr, "Codepoint %s invalid\n",
                        argv[i]);
                continue;
            }

            printf("%d\n", wcwidth((wchar_t) cp));
        } else {
            getwidth(argv[i]);
        }
    }

    exit(0);
}
/*
 * Write the command-line synopsis to stderr, then terminate the process
 * with exit status 1.  argv0 is the program name shown on each synopsis
 * line.  This function does not return.
 */
static void
usage(char *argv0)
{
    /* One call emits the whole text; adjacent literals are concatenated. */
    fprintf(stderr,
            "Usage: %s [--dump]\n"
            " %s [--ctype]\n"
            " %s U+XXXX [...]\n"
            " %s utf-8-sequence [...]\n"
            "Returns the column width of a Unicode codepoint "
            "or UTF-8 character sequence\n"
            "\t--dump\tDump complete width table\n"
            "\t--ctype\tPrint wctype(3) table.\n",
            argv0, argv0, argv0, argv0);

    exit(1);
}
/*
 * Print, on stdout, the sum of wcwidth() over every character of the
 * multibyte string `string`, followed by a newline.
 *
 * If a byte sequence cannot be converted, or a character has a negative
 * wcwidth(), a diagnostic goes to stderr and nothing is printed on stdout.
 */
static void
getwidth(const char *string)
{
    const char *cursor = string;    /* next unconverted byte */
    int remaining = strlen(string); /* bytes left from cursor */
    int total = 0;

    /*
     * In theory wcswidth() would do, but since this exercises how the
     * format libraries behave the string is walked one character at a
     * time.
     */

    /* Reset mbtowc()'s conversion state; the result itself is unused. */
    if (mbtowc(NULL, NULL, 0)) {}

    while (remaining > 0) {
        wchar_t wc;
        int cols;
        int nbytes = mbtowc(&wc, cursor, remaining);

        if (nbytes < 0) {
            /* The message shows the part that failed to convert. */
            fprintf(stderr, "Unable to convert string \"%s\"\n",
                    cursor);
            return;
        }
        if (nbytes == 0)
            break;

        cols = wcwidth(wc);
        if (cols < 0) {
            fprintf(stderr, "U+%04lX non-printable\n",
                    (unsigned long int) wc);
            return;
        }

        total += cols;
        cursor += nbytes;
        remaining -= nbytes;
    }

    printf("%d\n", total);
}
/* An inclusive span of codepoints, from min up to and including max. */
typedef struct {
    wchar_t min;
    wchar_t max;
} unicode_range;

/*
 * Codepoint spans walked by dumpwidth() and dumpctype().  Spans that do
 * not fit in this platform's wchar_t are compiled out.  The list ends at
 * the first entry whose max is zero.
 */
static unicode_range range[] = {
    /* https://en.wikipedia.org/wiki/Unicode#Code_point_planes_and_blocks */
    { 0x0000, 0x00ff },
#if WCHAR_MAX >= 0xffff
    { 0x0100, 0xffff },
#if WCHAR_MAX >= 0xfffff
    { 0x10000, 0x14fff },
    { 0x16000, 0x18fff },
    { 0x1b000, 0x1bfff },
    { 0x1d000, 0x1ffff },
    { 0x20000, 0x2ffff },
    { 0xe0000, 0xe0fff },
#endif
#endif
    { 0, 0 },   /* Terminates list. */
};
/*
 * Print the wcwidth() table for every span in range[].
 *
 * Within each span, consecutive codepoints that share a wcwidth() value
 * are merged into one output line of the form
 *
 *     START - END = WIDTH
 *
 * with START and END in upper-case hexadecimal (at least four digits).
 * A run never crosses a span boundary.  A span holding a single
 * codepoint yields one line for that codepoint.
 */
static void
dumpwidth(void)
{
    unicode_range *r;

    for (r = range; r->max; r++) {
        wchar_t start, wc;
        int lastwidth;

        /* Skip a malformed (empty) span. */
        if (r->min > r->max)
            continue;

        /* The first codepoint of the span opens the first run. */
        start = wc = r->min;
        lastwidth = wcwidth(wc);

        /*
         * Step with "!=" instead of "<=": wchar_t can be a 16-bit
         * unsigned short, so wc must never be incremented past
         * r->max or it could wrap around.
         */
        while (wc != r->max) {
            int width;

            wc++;
            width = wcwidth(wc);
            if (width != lastwidth) {
                /* Width changed: close the run that ended at wc - 1. */
                printf("%04lX - %04lX = %d\n", (unsigned long)start,
                       (unsigned long int)wc - 1, lastwidth);
                start = wc;
                lastwidth = width;
            }
        }

        /* Close the final run, which always ends at r->max. */
        printf("%04lX - %04lX = %d\n", (unsigned long)start,
               (unsigned long int)wc, lastwidth);
    }
}
/* Return `letter` when `is_member` is non-zero, otherwise '-'. */
static int
ctype_mark(int is_member, int letter)
{
    return is_member ? letter : '-';
}

/*
 * Print one line per codepoint for every span in range[]: the codepoint
 * in hexadecimal, its wcwidth(), and twelve flag columns.  Each column
 * shows a letter when the corresponding test is true and '-' otherwise,
 * in this order:
 *
 *     c iswcntrl   p iswprint   g iswgraph   a iswalpha
 *     u iswupper   l iswlower   d iswdigit   x iswxdigit
 *     N iswalnum   @ iswpunct   s iswspace   b iswblank
 *
 * NOTE: the #endif before the closing brace ends the #else that starts
 * inside main().  In a build without MULTIBYTE_SUPPORT that final brace
 * closes main() instead, so the directive and the brace must stay in
 * this order.
 */
static void
dumpctype(void)
{
    unicode_range *span;

    for (span = range; span->max; span++) {
        wchar_t wc = span->min;

        while (wc <= span->max) {
            printf("%6" PRIxMAX " %2d %c%c%c%c%c%c%c%c%c%c%c%c\n",
                   (uintmax_t)wc, wcwidth(wc),
                   ctype_mark(iswcntrl(wc), 'c'),
                   ctype_mark(iswprint(wc), 'p'),
                   ctype_mark(iswgraph(wc), 'g'),
                   ctype_mark(iswalpha(wc), 'a'),
                   ctype_mark(iswupper(wc), 'u'),
                   ctype_mark(iswlower(wc), 'l'),
                   ctype_mark(iswdigit(wc), 'd'),
                   ctype_mark(iswxdigit(wc), 'x'),
                   ctype_mark(iswalnum(wc), 'N'),
                   ctype_mark(iswpunct(wc), '@'),
                   ctype_mark(iswspace(wc), 's'),
                   ctype_mark(iswblank(wc), 'b'));

            /*
             * Stop before stepping past the end: wchar_t can be a
             * 16-bit unsigned short, where max + 1 would wrap.
             */
            if (wc == span->max)
                break;
            wc++;
        }
    }
#endif /* MULTIBYTE_SUPPORT */
}
|