/* $XTermId: charclass.c,v 1.22 2009/11/05 23:46:15 tom Exp $ */
/*
* Compact and efficient reimplementation of the
* xterm character class mechanism for large character sets
*
* Markus Kuhn -- mkuhn@acm.org -- 2000-07-03
*
* Xterm allows users to select entire words with a double-click on the left
* mouse button. Opinions might differ on what type of characters are part of
* separate words, therefore xterm allows users to configure a class code for
* each 8-bit character. Words are maximal-length sequences of neighboring
* characters with identical class code. Naively extending this mechanism to
* Unicode would require a class code table of at least 2^16 entries (128 kB).
*
* Instead, we transform the character class table into a list of intervals
* that will be accessed via a linear search. Changes made to the table by the
* user will be appended. A special class code IDENT (default) marks
* characters that have their code number as the class code.
*
* We could alternatively use a sorted table of non-overlapping intervals that
* can be accessed via binary search, but merging in new intervals is
* significantly more hassle and not worth the effort here.
*/
#include <xterm.h>
#include <charclass.h>
#if OPT_WIDE_CHARS
/*
 * One interval of the character class table: every character code in the
 * inclusive range first..last is assigned the class code cclass.
 */
struct classentry {
    int cclass;			/* class code given to the interval */
    int first;			/* lowest character code in the interval */
    int last;			/* highest character code in the interval */
};

/* The interval table itself; it is allocated by init_classtab(). */
static struct classentry *classtab;
/*
* Special convention for classtab[0]:
* - classtab[0].cclass is the allocated number of entries in classtab
* - classtab[0].first = 1 (first used entry in classtab)
* - classtab[0].last is the last used entry in classtab
*/
int
SetCharacterClassRange(int low, int high, int value)
{
if (high < low)
return -1; /* nothing to do */
/* make sure we have at least one free entry left at table end */
if (classtab[0].last > classtab[0].cclass - 2) {
classtab[0].cclass += 5 + classtab[0].cclass / 4;
classtab = TypeRealloc(struct classentry,
(unsigned) classtab[0].cclass, classtab);
if (!classtab)
abort();
}
/* simply append new interval to end of interval array */
classtab[0].last++;
classtab[classtab[0].last].first = low;
classtab[classtab[0].last].last = high;
classtab[classtab[0].last].cclass = value;
return 0;
}
/*
 * Class codes used for the built-in intervals set up by init_classtab().
 * Any other non-negative integer may be used as a class code as well.
 */
typedef enum {
    IDENT = -1,			/* each character is a class of its own:
				   CharacterClass() returns the character
				   code itself for a negative class */
    CNTRL = 1,			/* class shared by the control characters */
    BLANK = 32,			/* class shared by the blank characters */
    ALNUM = 48			/* class shared by letters, digits and '_' */
} Classes;
/*
 * Allocate the interval table and load it with the built-in character
 * classes.
 *
 * The table starts with room for INITIAL_ENTRIES entries (header included),
 * which is enough to hold all built-in intervals without growing.  The
 * intervals are appended in the order listed below; the order matters,
 * because CharacterClass() lets a later interval override an earlier one
 * that covers the same character.
 *
 * The process is aborted if the allocation yields a null pointer.
 */
void
init_classtab(void)
{
    enum { INITIAL_ENTRIES = 50 };

    static const struct {
	int low;		/* first character code of the interval */
	int high;		/* last character code of the interval */
	int code;		/* class code for the interval */
    } builtin[] = {
	/* old xterm default classes */
	{ 0, 0, BLANK },
	{ 1, 31, CNTRL },
	{ '\t', '\t', BLANK },
	{ '0', '9', ALNUM },
	{ 'A', 'Z', ALNUM },
	{ '_', '_', ALNUM },
	{ 'a', 'z', ALNUM },
	{ 127, 159, CNTRL },
	{ 160, 191, IDENT },
	{ 192, 255, ALNUM },
	{ 215, 215, IDENT },
	{ 247, 247, IDENT },
	/* added Unicode classes */
	{ 0x0100, 0xffdf, ALNUM },	/* mostly characters */
	{ 0x037e, 0x037e, IDENT },	/* Greek question mark */
	{ 0x0387, 0x0387, IDENT },	/* Greek ano teleia */
	{ 0x055a, 0x055f, IDENT },	/* Armenian punctuation */
	{ 0x0589, 0x0589, IDENT },	/* Armenian full stop */
	{ 0x0700, 0x070d, IDENT },	/* Syriac punctuation */
	{ 0x104a, 0x104f, IDENT },	/* Myanmar punctuation */
	{ 0x10fb, 0x10fb, IDENT },	/* Georgian punctuation */
	{ 0x1361, 0x1368, IDENT },	/* Ethiopic punctuation */
	{ 0x166d, 0x166e, IDENT },	/* Canadian Syl. punctuation */
	{ 0x17d4, 0x17dc, IDENT },	/* Khmer punctuation */
	{ 0x1800, 0x180a, IDENT },	/* Mongolian punctuation */
	{ 0x2000, 0x200a, BLANK },	/* spaces */
	{ 0x200b, 0x27ff, IDENT },	/* punctuation and symbols */
	{ 0x2070, 0x207f, 0x2070 },	/* superscript */
	{ 0x2080, 0x208f, 0x2080 },	/* subscript */
	{ 0x3000, 0x3000, BLANK },	/* ideographic space */
	{ 0x3001, 0x3020, IDENT },	/* ideographic punctuation */
	{ 0x3040, 0x309f, 0x3040 },	/* Hiragana */
	{ 0x30a0, 0x30ff, 0x30a0 },	/* Katakana */
	{ 0x3300, 0x9fff, 0x4e00 },	/* CJK Ideographs */
	{ 0xac00, 0xd7a3, 0xac00 },	/* Hangul Syllables */
	{ 0xf900, 0xfaff, 0x4e00 },	/* CJK Ideographs */
	{ 0xfe30, 0xfe6b, IDENT },	/* punctuation forms */
	{ 0xff00, 0xff0f, IDENT },	/* half/fullwidth ASCII */
	{ 0xff1a, 0xff20, IDENT },	/* half/fullwidth ASCII */
	{ 0xff3b, 0xff40, IDENT },	/* half/fullwidth ASCII */
	{ 0xff5b, 0xff64, IDENT }	/* half/fullwidth ASCII */
    };
    unsigned n;

    classtab = TypeMallocN(struct classentry, (unsigned) INITIAL_ENTRIES);
    if (classtab == 0)
	abort();

    /* set up the header entry: capacity, first used index, no entries yet */
    classtab[0].cclass = INITIAL_ENTRIES;
    classtab[0].first = 1;
    classtab[0].last = 0;

    for (n = 0; n < sizeof(builtin) / sizeof(builtin[0]); n++)
	SetCharacterClassRange(builtin[n].low, builtin[n].high,
			       builtin[n].code);
}
int
CharacterClass(int c)
{
int i, cclass = IDENT;
for (i = classtab[0].first; i <= classtab[0].last; i++)
if (classtab[i].first <= c && classtab[i].last >= c)
cclass = classtab[i].cclass;
if (cclass < 0)
cclass = c;
return cclass;
}
#ifdef NO_LEAKS
/*
 * Release the interval table and reset the pointer, so that no allocation
 * from this module is left behind.  Calling it when no table is allocated is
 * harmless, since free() accepts a null pointer.
 */
void
noleaks_CharacterClass(void)
{
    free(classtab);
    classtab = NULL;
}
#endif
#endif /* OPT_WIDE_CHARS */