1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
|
/*
* Here's the plan
* You type a UTF8 string, and it gives you all of the Unicode code points
* (in hex) on output.
*/
#include <stdio.h>
int main (void)
{
unsigned char string[256];
unsigned char * next_point;
unsigned char * this_point;
unsigned long code_point;
unsigned int numbytes;
for (;;)
{
fgets(string, 256, stdin);
if (feof(stdin)) {
exit(0);
}
next_point = this_point = string;
while (*this_point != '\n')
{
code_point = 0;
if (((unsigned long)(*this_point) & 0xFE) == 0xFC) {
numbytes = 6;
code_point = ((unsigned long)this_point[0] & 0x01) << 30;
code_point += ((unsigned long)this_point[1] & 0x3F) << 24;
code_point += ((unsigned long)this_point[2] & 0x3F) << 18;
code_point += ((unsigned long)this_point[3] & 0x3F) << 12;
code_point += ((unsigned long)this_point[4] & 0x3F) << 6;
code_point += ((unsigned long)this_point[5] & 0x3F);
this_point += 6;
} else if (((unsigned long)(*this_point) & 0xFC) == 0xF8) {
numbytes = 5;
code_point = ((unsigned long)this_point[0] & 0x03) << 24;
code_point += ((unsigned long)this_point[1] & 0x3F) << 18;
code_point += ((unsigned long)this_point[2] & 0x3F) << 12;
code_point += ((unsigned long)this_point[3] & 0x3F) << 6;
code_point += ((unsigned long)this_point[4] & 0x3F);
this_point += 5;
} else if (((unsigned long)(*this_point) & 0xF8) == 0xF0) {
numbytes = 4;
code_point = ((unsigned long)this_point[0] & 0x07) << 18;
code_point += ((unsigned long)this_point[1] & 0x3F) << 12;
code_point += ((unsigned long)this_point[2] & 0x3F) << 6;
code_point += ((unsigned long)this_point[3] & 0x3F);
this_point += 4;
} else if (((unsigned long)(*this_point) & 0xF0) == 0xE0) {
numbytes = 3;
code_point = ((unsigned long)this_point[0] & 0x0F) << 12;
code_point += ((unsigned long)this_point[1] & 0x3F) << 6;
code_point += ((unsigned long)this_point[2] & 0x3F);
this_point += 3;
} else if (((unsigned long)(*this_point) & 0xE0) == 0xC0) {
numbytes = 2;
code_point = (this_point[0] & 0x1F) << 6;
code_point += (this_point[1] & 0x3F);
this_point += 2;
} else if (((unsigned long)(*this_point) & 0x80) == 0x00) {
numbytes = 1;
code_point = (this_point[0] & 0x7F);
this_point++;
} else {
printf("Huh? %#x\n", (int)*this_point);
printf("%d\n", *this_point & 0x80);
exit(0);
}
printf("%#x\n", code_point);
}
}
}
|