File: UnicodeCharacter.t

package info (click to toggle)
tom 1.1.1-2
  • links: PTS
  • area: main
  • in suites: potato
  • size: 6,340 kB
  • ctags: 2,244
  • sloc: objc: 27,863; ansic: 9,804; sh: 7,411; yacc: 3,377; lex: 966; asm: 208; makefile: 62; cpp: 10
file content (254 lines) | stat: -rw-r--r-- 6,744 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
<copyright> UnicodeCharacter class.
    Written by Pieter J. Schoenmakers <tiggr@ics.ele.tue.nl>

    Copyright (C) 1996 Pieter J. Schoenmakers.

    This file is part of TOM.  TOM is distributed under the terms of the
    TOM License, a copy of which can be found in the TOM distribution; see
    the file LICENSE.

    <id>$Id: UnicodeCharacter.t,v 1.12 1998/01/05 00:56:01 tiggr Exp $</id>
    </copyright>

implementation class
UnicodeCharacter: State
{
  <doc> How many `;'-separated fields a single character record in the
      Unicode 2.0 Data file carries.  What each field means is buried in
      the {initWith} instance method, which consumes them one by one.
      </doc>
  const NUM_FIELDS = 15;

  <doc> Uniqued general category names.  Every `cat' static is filled in
      by {load}.  First the mark categories.  </doc>
  static UniqueString cat_Mn;
  static UniqueString cat_Mc;

  <doc> The number categories; see {isDigit} and {isNumeric}.  </doc>
  static UniqueString cat_Nd;
  static UniqueString cat_No;

  <doc> The separator categories; {isSpace} tests for `Zs'.  </doc>
  static UniqueString cat_Zs;
  static UniqueString cat_Zl;
  static UniqueString cat_Zp;

  <doc> The `C' categories.  </doc>
  static UniqueString cat_Cc;
  static UniqueString cat_Co;
  static UniqueString cat_Cn;

  <doc> The letter categories; see {isLetter}.  </doc>
  static UniqueString cat_Lu;
  static UniqueString cat_Ll;
  static UniqueString cat_Lt;
  static UniqueString cat_Lm;
  static UniqueString cat_Lo;

  <doc> The punctuation categories; see {isPunctuation}.  </doc>
  static UniqueString cat_Pd;
  static UniqueString cat_Ps;
  static UniqueString cat_Pe;
  static UniqueString cat_Po;

  <doc> The symbol categories.  </doc>
  static UniqueString cat_Sm;
  static UniqueString cat_Sc;
  static UniqueString cat_So;
}

<doc> Intern the two-letter general category names into the `cat'
    statics, so that the instance methods can compare a character's
    {category} against them by identity.  The {arguments} are not
    looked at.  </doc>
redefine void
  load Array arguments
{
  /* Letters.  */
  cat_Lu = [UniqueString with "Lu"];
  cat_Ll = [UniqueString with "Ll"];
  cat_Lt = [UniqueString with "Lt"];
  cat_Lm = [UniqueString with "Lm"];
  cat_Lo = [UniqueString with "Lo"];

  /* Marks.  */
  cat_Mn = [UniqueString with "Mn"];
  cat_Mc = [UniqueString with "Mc"];

  /* Numbers.  */
  cat_Nd = [UniqueString with "Nd"];
  cat_No = [UniqueString with "No"];

  /* Punctuation.  */
  cat_Pd = [UniqueString with "Pd"];
  cat_Ps = [UniqueString with "Ps"];
  cat_Pe = [UniqueString with "Pe"];
  cat_Po = [UniqueString with "Po"];

  /* Symbols.  */
  cat_Sm = [UniqueString with "Sm"];
  cat_Sc = [UniqueString with "Sc"];
  cat_So = [UniqueString with "So"];

  /* Separators.  */
  cat_Zs = [UniqueString with "Zs"];
  cat_Zl = [UniqueString with "Zl"];
  cat_Zp = [UniqueString with "Zp"];

  /* The remaining `C' categories.  */
  cat_Cc = [UniqueString with "Cc"];
  cat_Co = [UniqueString with "Co"];
  cat_Cn = [UniqueString with "Cn"];
}

<doc> Allocate a fresh {UnicodeCharacter} and hand the {line} to its
    designated initializer, {initWith}.  The {line} should be one line
    from the Unicode 2.0 Data file, without its line termination.  The
    result is whatever {initWith} answers, which is {nil} for a line it
    rejects.  </doc>
instance (id)
  with String line
{
  return [[self alloc] initWith line];
}

end;

implementation instance
UnicodeCharacter
{
  <doc> The code value of this character.  It is unique per character
      and is what {hash} and {equal} are based on.  </doc>
  public char unicode;

  <doc> The character name as of Unicode 2.0.  </doc>
  public String name;

  <doc> The general category, kept as a unique string.  </doc>
  public String category;

  <doc> The canonical combining class.  </doc>
  public int combining_class;

  <doc> The bidirectional category.  </doc>
  public String direction_category;

  <doc> The character decomposition.  </doc>
  public String decomposition;

  <doc> The decimal digit value, documented as -1 if not applicable.
      NOTE(review): {initWith} stores whatever {integerValue} yields for
      the field and never assigns -1 itself; confirm what an absent
      field produces.  </doc>
  public int decimal_value;

  <doc> The digit value, documented as -1 if not applicable.  The same
      review note as for {decimal_value} applies.  </doc>
  public int digit_value;

  <doc> The numeric value, kept as the (uniqued) text of the field.  </doc>
  public String numeric_value;

  <doc> Whether the character is marked mirrored in bidirectional text.
      </doc>
  public boolean mirrored;

  <doc> The old (Unicode 1.0) name.  </doc>
  public String old_name;

  <doc> The 10646 comment field.  </doc>
  public String comment;

  <doc> The upper case equivalent, or 0x0000 if not applicable.  </doc>
  public char upper;

  <doc> The lower case equivalent, or 0x0000 if not applicable.  </doc>
  public char lower;

  <doc> The title case equivalent, or 0x0000 if not applicable.  </doc>
  public char title;
}

<doc> Hash on the code value, which keeps {hash} in line with {equal}.
    </doc>
redefine int
  hash
{
  return unicode;
}

<doc> Two characters are equal when they carry the same code value.  The
    {other} object is simply asked for its {unicode}; no check is made
    on what kind of object it is.  </doc>
redefine boolean
  equal id other
{
  char theirs = [other unicode];

  return unicode == theirs;
}

<doc> Designated initializer.  Initialize from the {line}.  </doc>
id
  initWith String line
{
  /* Parse one record of the Unicode 2.0 Data file.  The {line} is split
     on `;' and must yield exactly NUM_FIELDS fields; each field is then
     stored, in field order, in the corresponding instance variable.
     The receiver is returned on success.  {nil} is returned as soon as
     the field count is wrong, a numeric field is rejected, or the
     mirrored flag is neither "Y" nor "N"; instance variables assigned
     before that point keep their new values.  */
  MutableArray fields = [line componentsSeparatedBy ';'];
  int i, n = [fields length];
  /* Every {integerValue} call below is unpacked into (v, b, i).
     NOTE(review): {b} is presumably the `parsed successfully' flag and
     {i} presumably the number of characters consumed -- confirm against
     the declaration of {integerValue} in String.  */
  boolean b;
  int v;

  if (n != NUM_FIELDS)
    return nil;

  /* Field 0: the code value, read in base 16.  It must lie in the range
     [0, 0x10000), and {i} must be 4 -- presumably demanding exactly
     four hexadecimal digits (see the note on {i} above).  */
  (v, b, i) = [fields[0] integerValue (0, -1) defaultBase: 16 allowSign: NO
		         allowCBases: NO baseSeparator: byte (0)];
  if (!b || i != 4 || v < 0 || v >= 0x10000)
    return nil;
  unicode = char (v);

  /* Fields 1 and 2: name and general category, both uniqued.  Uniquing
     the category is what lets the `is...' predicates compare it with
     `=='.  */
  name = [UniqueString with fields[1]];
  category = [UniqueString with fields[2]];

  /* Field 3: the canonical combining class, read in base 10.  */
  (v, b, i) = [fields[3] integerValue (0, -1) defaultBase: 10 allowSign: NO
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  combining_class = int (v);

  /* Fields 4 and 5: bidirectional category and decomposition.  */
  direction_category = [UniqueString with fields[4]];
  decomposition = [UniqueString with fields[5]];

  /* Fields 6 and 7: decimal digit value and digit value, read in base
     10.  Unlike the other numeric fields, no allowSign: is passed for
     these two, so that argument presumably falls back to the default
     {integerValue} declares -- confirm.  */
  (v, b, i) = [fields[6] integerValue (0, -1) defaultBase: 10
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  decimal_value = int (v);

  (v, b, i) = [fields[7] integerValue (0, -1) defaultBase: 10
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  digit_value = int (v);

  /* Field 8: the numeric value, kept as text and not parsed.  */
  numeric_value = [UniqueString with fields[8]];

  /* Field 9: the mirrored flag.  {mirrored} is only ever set to TRUE
     here; for "N" it is left untouched.  Anything else rejects the
     line.  */
  if (["Y" equal fields[9]])
    mirrored = TRUE;
  else if (!["N" equal fields[9]])
    return nil;

  /* Fields 10 and 11: the old name and the comment.  */
  old_name = [UniqueString with fields[10]];
  comment = [UniqueString with fields[11]];

  /* Fields 12, 13 and 14: the upper, lower and title case equivalents,
     read in base 16.  Unlike field 0, no check is made on {i} or on the
     range of {v} before narrowing it to a char.  */
  (v, b, i) = [fields[12] integerValue (0, -1) defaultBase: 16 allowSign: NO
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  upper = char (v);

  (v, b, i) = [fields[13] integerValue (0, -1) defaultBase: 16 allowSign: NO
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  lower = char (v);

  (v, b, i) = [fields[14] integerValue (0, -1) defaultBase: 16 allowSign: NO
			 allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  title = char (v);

  = self;
}

<doc> Answer whether this is a decimal digit character, i.e. whether its
    general category is `Nd'.  </doc>
boolean
  isDigit
{
  return category == cat_Nd;
}

<doc> Answer whether this is a letter, i.e. whether its general category
    is one of `Lu', `Ll', `Lt', `Lm', or `Lo'.  </doc>
boolean
  isLetter
{
  /* Comparing with `==' is good enough here: the {category} and each of
     the statics starting with `cat' are unique strings created by this
     class, so equal contents imply the very same object.  */
  if (category == cat_Lu || category == cat_Ll || category == cat_Lt)
    return TRUE;

  return category == cat_Lm || category == cat_Lo;
}

<doc> Answer whether this character is in lower case.  The {category} is
    not consulted: a character counts as lower case exactly when it has
    an upper case equivalent.  Consequently, a character with both an
    upper and a lower case equivalent answers {TRUE} to both {isLower}
    and {isUpper}.  </doc>
boolean
  isLower
{
  return !(upper == char (0));
}

<doc> Answer whether this is a numeric character, i.e. whether its
    general category is `Nd' or `No'.  </doc>
boolean
  isNumeric
{
  if (category == cat_Nd)
    return TRUE;

  return category == cat_No;
}

<doc> Answer whether this is a punctuation character, i.e. whether its
    general category is one of `Pd', `Ps', `Pe', or `Po'.  </doc>
boolean
  isPunctuation
{
  if (category == cat_Pd || category == cat_Ps)
    return TRUE;

  return category == cat_Pe || category == cat_Po;
}

<doc> Answer whether this character is space.  That is the case for
    every character the UnicodeData file puts in category `Zs', and, in
    addition, for the characters 9 (tab), 10 (linefeed), 11 (vertical
    tab), 12 (form feed), and 13 (carriage return).  </doc>
boolean
  isSpace
{
  if (category == cat_Zs)
    return TRUE;

  /* The control characters 9 up to and including 13.  */
  return unicode >= char (9) && unicode <= char (13);
}

<doc> Answer whether this character is in upper case.  The {category} is
    not consulted: a character counts as upper case exactly when it has
    a lower case equivalent.  Consequently, a character with both a
    lower and an upper case equivalent answers {TRUE} to both {isUpper}
    and {isLower}.  </doc>
boolean
  isUpper
{
  return !(lower == char (0));
}

end;