1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
|
#ifndef CTYPE_ASCII_INCLUDED
#define CTYPE_ASCII_INCLUDED
#include "myisampack.h"
/*
  Non-letter detector for a word of packed 7bit ASCII bytes.

  For every byte of "i" the expression first computes
    X= (byte + 0x1F) & 0x1F
  which is the offset of the byte inside its 32-character row of the
  ASCII table, minus one, modulo 32. X*5 never exceeds 0x9B, so it stays
  inside the byte, and it has the bit 0x80 set only when X >= 26,
  i.e. only for these six (out of 32) offsets in the row:
    0x00, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F.
  These are exactly the offsets of the non-letter characters
  in the rows [40..5F] and [60..7F]:

  ----------------  --------------------------------
  Magic bit         10000000000000000000000000011111
  ASCII 0x00..0x1F  ................................ Control
  ASCII 0x20..0x3F  ................................ Punctuation, digits
  ASCII 0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_
  ASCII 0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.
  ----------------  --------------------------------

  The final shift right by two moves the magic bit from 0x80 to 0x20.
  So for bytes in the ranges [40..5F] and [60..7F] the result
  has the bit 0x20 set for all non-letter characters and
  cleared for all letters.

  Note, all other bits of the result contain garbage (in particular
  the shift drags the two low bits of every byte into its lower
  neighbour), so the caller must mask the result with 0x20 per byte.

  Requirements:
    All bytes must be in the range [00..7F],
    otherwise the addition overflows and carries into the next byte.
*/
#define MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i) \
  ((5 * ((0x1F1F1F1F1F1F1F1FULL + (i)) & 0x1F1F1F1F1F1F1F1FULL)) >> 2)
/*
  Range detector: in every byte of the result the bit 0x20 is
  - 1 for input bytes in the ranges [60..7F] or [E0..FF]
  - 0 otherwise

  It works by AND-ing the bits 0x40 and 0x20 of every byte:
  shifting right by one puts the bit 0x40 onto the position 0x20,
  where it meets the original bit 0x20.

  Bytes in the ranges [40..7F] and [C0..FF] have the bit 0x40 set.
  Bytes in the ranges [60..7F] and [E0..FF] have also the bit 0x20 set.

  Hex      BinHi  BinLo
  ----     -1--   ----
  0x[4C]X  .10.   ....
  0x[5D]X  .10.   ....
  0x[6E]X  .11.   ....
  0x[7F]X  .11.   ....

  Only the bit 0x20 of every result byte is meaningful,
  the other bits contain garbage.
  Note, the argument is evaluated twice.
*/
#define MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) ((i) & ((i) >> 1))
/*
  Lower case letter detector. Every byte of the result is exactly
  0x20 if the corresponding input byte is a lower case
  ASCII letter [a-z], and 0x00 otherwise:

  Value     Range       Character range                   Subrange
  --------  ----------  --------------------------------  -----------
  00000000  0x00..0x3F  Control, punctuation, digits
  00000000  0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_  letters A-Z
  00000000  0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_  non-letters
  00100000  0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.  letters a-z
  00000000  0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.  non-letters

  The bit 0x20 survives only where the byte is in the row [60..7F]
  (first operand) and is not flagged as a non-letter (second operand).
  The constant mask then removes the garbage bits
  left by the two helper expressions.

  Requirements:
    All bytes must be in the range [00..7F].
    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC().
*/
#define MY_ASCII_20_IF_IS_LOWER_LETTER(i)            \
  (0x2020202020202020ULL &                           \
   MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) &   \
   ~MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i))
/*
  Convert lower case ASCII letters to upper case by flipping
  the bit 0x20 in every byte that MY_ASCII_20_IF_IS_LOWER_LETTER()
  reports as a lower case letter. All other bytes are left unchanged.

  The argument is parenthesized in the expansion, so that
  an expression argument (e.g. "x | y") is converted as a whole
  rather than being torn apart by operator precedence.
  Note, the argument is evaluated more than once:
  do not pass expressions with side effects.

  Requirements:
    All bytes must be in the range [00..7F].
    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC()
*/
#define MY_ASCII_TOUPPER_MAGIC(i) \
  ((i) ^ MY_ASCII_20_IF_IS_LOWER_LETTER(i))
/*
  Upper-case a string of 8 bytes packed into a 64-bit integer,
  without any loops or table lookups.

  @param i  - eight 7bit ASCII bytes packed into one integer
  @return   - the same eight bytes with [a-z] replaced by [A-Z]

  Requirements:
    All bytes must be in the range [00..0x7F].
    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC().
    The result on 8bit data is unpredictable!!!
    The caller should make sure not to pass 8bit data.
*/
static inline ulonglong my_ascii_to_upper_magic_uint64(ulonglong i)
{
  /* 0x20 in every byte that holds a lower case letter, 0x00 elsewhere */
  const ulonglong lower_case_bits= MY_ASCII_20_IF_IS_LOWER_LETTER(i);
  /* Flipping 0x20 turns [a-z] into [A-Z] */
  return i ^ lower_case_bits;
}
/*
  Check if the 4-byte ASCII fast path can be used at the current
  positions of two strings, i.e. if:
  - both strings "a" and "b" have at least 4 bytes left, and
  - none of the leading 4 bytes of either string has the bit 0x80 set
    (the leading 4 bytes are 7bit data).

  @param a   - the current position in the first string
  @param ae  - the end of the first string (exclusive)
  @param b   - the current position in the second string
  @param be  - the end of the second string (exclusive)
  @return    - non-zero if both conditions hold, 0 otherwise
*/
static inline int
my_strcoll_ascii_4bytes_found(const uchar *a, const uchar *ae,
                              const uchar *b, const uchar *be)
{
  /*
    Compare the remaining lengths rather than testing "a + 4 <= ae":
    when less than 4 bytes are left, "a + 4" would point beyond
    one-past-the-end of the buffer, which is undefined behavior.
    The bytes are loaded only after both length checks have passed.
  */
  return (ae - a) >= 4 && (be - b) >= 4 &&
         (uint4korr(b) & 0x80808080) == 0 &&
         (uint4korr(a) & 0x80808080) == 0;
}
/*
Compare the leading four 7bit ASCII bytes in two strings case insensitively
by converting letters [a-z] to upper case [A-Z].
Requirements:
- The input strings must have at least four bytes, and
- The leading four bytes in both strings must be 7bit ASCII.
The caller must make sure to provide only strings that meet
these requirements. The result on 8-bit data is unpredictable
as 8-bit bytes may cause overflow in my_ascii_to_upper_magic_uint64().
See comments above.
*/
static inline int
my_strcoll_ascii_toupper_4bytes(const uchar *a, const uchar *b)
{
ulonglong abn= (((ulonglong) mi_uint4korr(a)) << 32) | mi_uint4korr(b);
abn= my_ascii_to_upper_magic_uint64(abn);
if ((uint32) (abn >> 32) == (uint32) abn)
return 0;
return ((uint32) (abn >> 32)) < ((uint32) abn) ? -1 : + 1;
}
/*
  Case insensitive comparison of the leading eight 7bit ASCII bytes
  of two strings. Letters [a-z] are converted to upper case [A-Z]
  before the comparison.

  @param a  - the first string
  @param b  - the second string
  @return   - 0 if the converted prefixes are equal,
              -1 if the prefix of "a" is less than the prefix of "b",
              +1 otherwise

  Requirements:
  - The input strings must have at least eight bytes, and
  - The leading eight bytes in both strings must be 7bit ASCII.
  See comments in my_strcoll_ascii_toupper_4bytes().
*/
static inline int
my_strcoll_ascii_toupper_8bytes(const uchar *a, const uchar *b)
{
  /*
    TODO:
    Try to get advantage of SIMD instructions by massive comparison
    (16 bytes at a time) of characters against (x>='a' && x<='z') using:
    - either explicit intrinsics
    - or a loop that can get vectorized automatically by some compilers.
  */
  const ulonglong upper_a= my_ascii_to_upper_magic_uint64(mi_uint8korr(a));
  const ulonglong upper_b= my_ascii_to_upper_magic_uint64(mi_uint8korr(b));

  if (upper_a == upper_b)
    return 0;
  return upper_a < upper_b ? -1 : +1;
}
/*
  Binary style (case sensitive) comparison of the leading four
  7bit ASCII bytes of two strings. The bytes are loaded with
  mi_uint4korr() and compared as unsigned 32-bit numbers.

  @param a  - the first string, must have at least four bytes
  @param b  - the second string, must have at least four bytes
  @return   - 0 if the loaded values are equal,
              -1 if the value of "a" is less than the value of "b",
              +1 otherwise
*/
static inline int
my_strcoll_mb7_bin_4bytes(const uchar *a, const uchar *b)
{
  const uint32 lhs= mi_uint4korr(a);
  const uint32 rhs= mi_uint4korr(b);

  if (lhs != rhs)
    return lhs < rhs ? -1 : +1;
  return 0;
}
/*
  Binary style (case sensitive) comparison of the leading eight
  7bit ASCII bytes of two strings. The bytes are loaded with
  mi_uint8korr() and compared as unsigned 64-bit numbers.

  @param a  - the first string, must have at least eight bytes
  @param b  - the second string, must have at least eight bytes
  @return   - 0 if the loaded values are equal,
              -1 if the value of "a" is less than the value of "b",
              +1 otherwise
*/
static inline int
my_strcoll_mb7_bin_8bytes(const uchar *a, const uchar *b)
{
  const ulonglong lhs= mi_uint8korr(a);
  const ulonglong rhs= mi_uint8korr(b);

  if (lhs < rhs)
    return -1;
  if (lhs > rhs)
    return +1;
  return 0;
}
#endif /* CTYPE_ASCII_INCLUDED */
|