/* Multibyte character data type.
Copyright (C) 2001, 2005-2007, 2009-2010, 2021,2024 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Written by Bruno Haible <bruno@clisp.org>. */
#ifndef _SHMBCHAR_H
#define _SHMBCHAR_H 1
#if defined (HANDLE_MULTIBYTE)
#include <string.h>
/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
<wchar.h>.
BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
<wchar.h>. */
#include <stdio.h>
#include <time.h>
#include <wchar.h>
#include <wctype.h>
/* is_basic(c) tests whether the single-byte character c is
- in the ISO C "basic character set" or is one of '@', '$', and '`'
which ISO C 23 § 5.2.1.1.(1) guarantees to be single-byte and in
practice are safe to treat as basic in the execution character set,
or
- in the POSIX "portable character set", which
<https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap06.html>
equally guarantees to be single-byte. */
#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
&& ('$' == 36) && ('%' == 37) && ('&' == 38) && ('\'' == 39) \
&& ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) \
&& (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) \
&& ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) \
&& ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) \
&& ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) \
&& ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) \
&& ('@' == 64) && ('A' == 65) && ('B' == 66) && ('C' == 67) \
&& ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) \
&& ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) \
&& ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) \
&& ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) \
&& ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) \
&& ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) \
&& ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) \
&& ('`' == 96) && ('a' == 97) && ('b' == 98) && ('c' == 99) \
&& ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) \
&& ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) \
&& ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) \
&& ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) \
&& ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) \
&& ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) \
&& ('|' == 124) && ('}' == 125) && ('~' == 126)
/* The character set is ISO-646, not EBCDIC. */
# define IS_BASIC_ASCII 1
/* Bitmap consulted by is_basic(); only declared here, the definition
   lives in another translation unit.  is_basic() reads it as 32 flag
   bits per array element.  */
extern const unsigned int is_basic_table[];

/* Return 1 if the flag for the single-byte character C is set in
   is_basic_table, 0 otherwise.
   NOTE(review): assumes the table has an element for every index up to
   UCHAR_MAX / 32 and that unsigned int has at least 32 bits -- confirm
   against the table's definition.  */
static inline int
is_basic (char c)
{
  /* Go through unsigned char so that a negative char value can never
     produce a negative index or shift count.  */
  const unsigned char byte = (unsigned char) c;
  /* Array element that holds the flag for this byte.  */
  const unsigned int word = is_basic_table[byte / 32];
  /* Position of the flag inside that element.  */
  const unsigned int bit = byte % 32u;

  return (word >> bit) & 1;
}
#if 0
/* XXX - FUTURE */
/* All locale encodings (see localcharset.h) map the characters 0x00..0x7F
to U+0000..U+007F, like ASCII, except for
CP864 different mapping of '%'
SHIFT_JIS different mappings of 0x5C, 0x7E
JOHAB different mapping of 0x5C
However, these characters in the range 0x20..0x7E are in the ISO C
"basic character set" and in the POSIX "portable character set", which
ISO C and POSIX guarantee to be single-byte. Thus, locales with these
encodings are not POSIX compliant. And they are most likely not in use
any more (as of 2023). */
/* Disabled alternative (never compiled while guarded by "#if 0"): a
   function-like macro that would classify every byte value below 0x80
   as basic, using a single comparison and no table lookup.  */
#define is_basic(c) ((unsigned char) (c) < 0x80)
#endif
#else
/* Return 1 if the single-byte character C is one of the characters
   enumerated below, 0 otherwise.  This variant is the fallback used when
   the preprocessor check of the character codes does not hold, so every
   character is named by its literal rather than by its numeric code.  */
static inline int
is_basic (char c)
{
  int basic;

  /* ISO C guarantees that the decimal digits are contiguous and in
     ascending order in every execution character set, so a range test
     is safe for them (it would not be for letters).  */
  if (c >= '0' && c <= '9')
    return 1;

  switch (c)
    {
    /* Control characters.  */
    case '\b': case '\t': case '\n': case '\v': case '\f': case '\r':
    /* Space and punctuation.  */
    case ' ': case '!': case '"': case '#': case '$': case '%':
    case '&': case '\'': case '(': case ')': case '*': case '+':
    case ',': case '-': case '.': case '/':
    case ':': case ';': case '<': case '=': case '>': case '?': case '@':
    case '[': case '\\': case ']': case '^': case '_': case '`':
    case '{': case '|': case '}': case '~':
    /* Uppercase letters.  */
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    case 'V': case 'W': case 'X': case 'Y': case 'Z':
    /* Lowercase letters.  */
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    case 'v': case 'w': case 'x': case 'y': case 'z':
      basic = 1;
      break;

    default:
      basic = 0;
      break;
    }

  return basic;
}
#endif
#endif /* HANDLE_MULTIBYTE */
#endif /* _SHMBCHAR_H */