File: mcel.h

package info (click to toggle)
diffutils 1%3A3.12-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 19,980 kB
  • sloc: ansic: 116,841; sh: 12,881; perl: 535; makefile: 233; sed: 16
file content (307 lines) | stat: -rw-r--r-- 10,261 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
/* Multi-byte characters, Error encodings, and Lengths (MCELs)
   Copyright 2023-2025 Free Software Foundation, Inc.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Paul Eggert.  */

/* The inline functions in this file implement multi-byte character
   representation and forward iteration through a multi-byte string.
   They are simpler and can be faster than the mbiter family.
   However, they do not support obsolescent encodings like CP864,
   EBCDIC, Johab, and Shift JIS that glibc also does not support,
   and it is up to the caller to coalesce encoding-error bytes if desired.

   The mcel_scan function lets code iterate through an array of bytes,
   supporting character encodings in practical use
   more simply than using plain mbrtoc32.

   Instead of this single-byte code:

      char *p = ..., *lim = ...;
      for (; p < lim; p++)
        process (*p);

   You can use this multi-byte code:

      char *p = ..., *lim = ...;
      for (mcel_t g; p < lim; p += g.len)
        {
          g = mcel_scan (p, lim);
          process (g);
        }

   You can select from G using G.ch, G.err, and G.len.
   G is an encoding error if G.err is nonzero, a character otherwise.

   The mcel_scanz function is similar except it works with a
   string of unknown but positive length that is terminated with '\0'.
   Instead of this single-byte code:

      char *p = ...;
      for (; *p; p++)
        process (*p);

   You can use this multi-byte code:

      char *p = ...;
      for (mcel_t g; *p; p += g.len)
        {
          g = mcel_scanz (p);
          process (g);
        }

   mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
   string is terminated by TERMINATOR.  The C standard says that the
   TERMINATORs '\0', '\r', '\n', '.', '/' are safe, as they cannot be
   a part (even a trailing byte) of a multi-byte character.
   In practice TERMINATOR is safe if 0 <= TERMINATOR <= 0x2f (ASCII '/').

   mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.

   mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
   character or by encoding byte value, with encoding bytes sorting
   after characters.

   Calls like c32isalpha (G.ch) test G; they return false for encoding
   errors since calls like c32isalpha (0) return false.  Calls like
   mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
   but transliterate first.

   Although ISO C and POSIX allow encodings that have shift states or
   that can produce multiple characters from an indivisible byte sequence,
   POSIX does not require support for these encodings,
   they are not in practical use on GNUish platforms,
   and omitting support for them simplifies the API.  */

#ifndef _MCEL_H
#define _MCEL_H 1

#if !_GL_CONFIG_H_INCLUDED
 #error "Please include config.h first."
#endif

#include <verify.h>

#include <limits.h>
#include <stddef.h>
#include <uchar.h>

_GL_INLINE_HEADER_BEGIN
#ifndef MCEL_INLINE
# define MCEL_INLINE _GL_INLINE
#endif

#ifdef __cplusplus
extern "C" {
#endif


/* Pacify GCC re type limits.  */
#if 4 < __GNUC__ + (3 <= __GNUC_MINOR__) && !defined __clang__
# pragma GCC diagnostic ignored "-Wtype-limits"
#endif

/* The longest multi-byte character that mcel supports on any platform:
   MB_LEN_MAX or 4, whichever is smaller.
   This can be less than MB_LEN_MAX because many platforms have a
   large MB_LEN_MAX to allow for stateful encodings, which mcel does
   not support.  Four bytes are enough for UTF-8, EUC, Shift-JIS,
   GB18030, etc.  In all multi-byte encodings supported by glibc,
   0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
enum { MCEL_LEN_MAX = 4 < MB_LEN_MAX ? 4 : MB_LEN_MAX };

/* Bounds for mcel_t members: the largest value a character (CH) may
   have, and the smallest value an encoding error byte (ERR) may have.  */
enum
{
  MCEL_CHAR_MAX = 0x10FFFF,
  MCEL_ERR_MIN = 0x80
};

/* mcel_t is a type representing a character CH or an encoding error byte ERR,
   along with a count of the LEN bytes that represent CH or ERR.
   If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
   otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR,
   CH == 0, and LEN == 1.  */
typedef struct
{
  /* The character; 0 when this value is an encoding error.  */
  char32_t ch;
  /* 0 for a valid character; otherwise the encoding error byte.  */
  unsigned char err;
  /* How many bytes represent CH or ERR.  */
  unsigned char len;
} mcel_t;

/* Every multi-byte character length fits in mcel_t's LEN.  */
static_assert (MB_LEN_MAX <= UCHAR_MAX);

/* Shifting an encoding error byte left by this value
   suffices to sort encoding errors after characters.
   The smallest shifted error, MCEL_ERR_MIN << MCEL_ERR_SHIFT,
   is 0x80 << 14 == 0x200000, which exceeds MCEL_CHAR_MAX (0x10FFFF);
   the assertion below checks this at compile time.  */
enum { MCEL_ERR_SHIFT = 14 };
static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);

/* Unsigned char promotes to int, so the difference of two ERR members
   (as computed by mcel_cmp) is a signed int.  */
static_assert (UCHAR_MAX <= INT_MAX);

/* Bytes have 8 bits, as POSIX requires.  */
static_assert (CHAR_BIT == 8);

/* Fallback definitions, used only when _GL_LIKELY is not already defined.
   _GL_LIKELY (COND) yields COND while telling the compiler that COND is
   expected to be 1; _GL_UNLIKELY (COND) does likewise with expected
   value 0.  */
#ifndef _GL_LIKELY
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
#endif

/* mcel_t constructors.  */

/* Return the mcel_t for the valid character CH that is represented
   by LEN bytes.  The caller must ensure that CH <= MCEL_CHAR_MAX and
   that 0 < LEN <= MCEL_LEN_MAX.  The result's ERR member is zero.  */
MCEL_INLINE mcel_t
mcel_ch (char32_t ch, size_t len)
{
  assume (ch <= MCEL_CHAR_MAX);
  assume (0 < len);
  assume (len <= MCEL_LEN_MAX);

  /* Members not named in the initializer (here, ERR) start out zero.  */
  mcel_t g = {.ch = ch, .len = len};
  return g;
}
/* Return the mcel_t for the encoding error byte ERR, which the caller
   must ensure is at least MCEL_ERR_MIN.  The result has LEN == 1 and
   a zero CH member.  */
MCEL_INLINE mcel_t
mcel_err (unsigned char err)
{
  assume (MCEL_ERR_MIN <= err);

  /* Members not named in the initializer (here, CH) start out zero.  */
  mcel_t g = {.len = 1, .err = err};
  return g;
}

/* Three-way comparison of C1 and C2.  Characters order by their CH
   value and encoding errors by their ERR byte, with every encoding
   error sorting after every character.
   Return <0, 0, >0 for <, =, >.  */
MCEL_INLINE int
mcel_cmp (mcel_t c1, mcel_t c2)
{
  /* The ERR difference is scaled by 2**MCEL_ERR_SHIFT so that it
     outweighs any possible CH difference.  */
  int err_diff = c1.err - c2.err;
  int lhs = c1.ch;
  int rhs = c2.ch;
  int ch_diff = lhs - rhs;
  return err_diff * (1 << MCEL_ERR_SHIFT) + ch_diff;
}

/* Apply the uchar translator TO to C1 and C2 and compare the results,
   with encoding errors sorting after characters.
   TO is called only when C1 and C2 have equal ERR members and
   mcel_cmp reports them as different.
   Return <0, 0, >0 for <, =, >.  */
MCEL_INLINE int
mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
{
  int cmp = mcel_cmp (c1, c2);

  /* The untranslated comparison is final when the ERR members differ
     or when the two values are already equal.  */
  int decided = (c1.err != c2.err) | (cmp == 0);
  if (_GL_LIKELY (decided))
    return cmp;

  int lhs = to (c1.ch);
  int rhs = to (c2.ch);
  return lhs - rhs;
}

/* Return true if C, taken as the first byte of a single- or multi-byte
   character, stands for itself as a Unicode character; that is, if
   0 <= C < MCEL_ERR_MIN.
   These days it is safe to assume ASCII, so obsolescent encodings
   like CP864, EBCDIC, Johab, and Shift JIS are not supported.  */
MCEL_INLINE bool
mcel_isbasic (char c)
{
  /* Written to work whether plain char is signed or unsigned.  */
  return _GL_LIKELY (! (c < 0 || MCEL_ERR_MIN <= c));
}

/* With mcel there should be no need for the performance overhead of
   replacing glibc mbrtoc32, as callers shouldn't care whether the
   C locale treats a byte with the high bit set as an encoding error.  */
#ifdef __GLIBC__
# undef mbrtoc32
#endif

/* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
   Return the character or encoding error starting at P.

   If *P is a basic (ASCII) byte, return it as a one-byte character
   without calling mbrtoc32.  Otherwise call mbrtoc32 on at most
   LIM - P bytes, using a freshly initialized conversion state.
   If the value mbrtoc32 returns has its top bit set, return an
   encoding error of length 1 for the byte *P; otherwise return the
   decoded character with the length that mbrtoc32 reported.  */
MCEL_INLINE mcel_t
mcel_scan (char const *p, char const *lim)
{
  /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
     In supported encodings, the first byte of a multi-byte character
     cannot be an ASCII byte.  */
  char c = *p;
  if (mcel_isbasic (c))
    return mcel_ch (c, 1);

  /* An initial mbstate_t; initialization optimized for some platforms.
     For details about these and other platforms, see wchar.in.h.  */
#if (defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__) \
     && !defined __UCLIBC__)
  /* Although only a trivial optimization, it's worth it for GNU.  */
  mbstate_t mbs; mbs.__count = 0;
#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
       || (defined __APPLE__ && defined __MACH__))
  /* These platforms have 128-byte mbstate_t.  What were they thinking?
     Initialize just for supported encodings (UTF-8, EUC, etc.).
     Avoid memset because some compilers generate function call code.  */
  struct mbhidden { char32_t ch; int utf8_want, euc_want; }
    _GL_ATTRIBUTE_MAY_ALIAS;
  union { mbstate_t m; struct mbhidden s; } u;
  u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
  /* Let the code below refer to the union's mbstate_t member as 'mbs'.  */
# define mbs u.m
#elif defined __NetBSD__
  /* Experiments on both 32- and 64-bit NetBSD platforms have
     shown that it doesn't work to clear fewer than 24 bytes.  */
  struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
  union { mbstate_t m; struct mbhidden s; } u;
  u.s.a = u.s.b = u.s.c = 0;
  /* Let the code below refer to the union's mbstate_t member as 'mbs'.  */
# define mbs u.m
#else
  /* mbstate_t has unknown structure or is not worth optimizing.  */
  mbstate_t mbs = {0};
#endif

  char32_t ch;
  size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);

  /* Remove the 'mbs' macro, if one of the branches above defined it,
     so that it does not remain defined after this point.  */
#undef mbs

  /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
     is not supported and MB_LEN_MAX is small.  */
  if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
    /* C is not basic here, so as an unsigned char it is at least
       MCEL_ERR_MIN, as mcel_err requires.  */
    return mcel_err (c);

  /* A multi-byte character.  LEN must be positive,
     as *P != '\0' and shift sequences are not supported.  */
  return mcel_ch (ch, len);
}

/* Scan the character or encoding error at the start of P, a byte
   sequence that ends with the byte TERMINATOR.
   If *P == TERMINATOR, scan just that byte; otherwise scan
   bytes up to but not including TERMINATOR.
   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
   Return the character or encoding error starting at P.  */
MCEL_INLINE mcel_t
mcel_scant (char const *p, char terminator)
{
  /* An ASCII byte is a complete character; return it at once.  */
  char first = *p;
  if (mcel_isbasic (first))
    return mcel_ch (first, 1);

  /* For non-ASCII, find a limit for mcel_scan: advance past up to
     MCEL_LEN_MAX - 1 further bytes, never stepping over TERMINATOR.
     This is typically faster than strnlen.  */
  char const *lim = p + 1;
  int remaining = MCEL_LEN_MAX - 1;
  while (0 < remaining--)
    if (*lim != terminator)
      lim++;

  return mcel_scan (p, lim);
}

/* Scan the character or encoding error at the start of P, a byte
   sequence terminated by '\0'; this is mcel_scant with a null
   terminator.
   If *P == '\0', scan just that byte; otherwise scan
   bytes up to but not including '\0'.
   Return the character or encoding error starting at P.  */
MCEL_INLINE mcel_t
mcel_scanz (char const *p)
{
  mcel_t g = mcel_scant (p, '\0');
  return g;
}


#ifdef __cplusplus
}
#endif

_GL_INLINE_HEADER_END

#endif /* _MCEL_H */