File: UnicodeCharacter.t

package info (click to toggle)
tom 1.1.1-2
links: PTS
area: main
in suites: potato
size: 6,340 kB
ctags: 2,244
sloc: objc: 27,863; ansic: 9,804; sh: 7,411; yacc: 3,377; lex: 966; asm: 208; makefile: 62; cpp: 10
file content (254 lines) | stat: -rw-r--r-- 6,744 bytes
<copyright> UnicodeCharacter class.
    Written by Pieter J. Schoenmakers <tiggr@ics.ele.tue.nl>

    Copyright (C) 1996 Pieter J. Schoenmakers.

    This file is part of TOM.  TOM is distributed under the terms of the
    TOM License, a copy of which can be found in the TOM distribution; see
    the file LICENSE.

    <id>$Id: UnicodeCharacter.t,v 1.12 1998/01/05 00:56:01 tiggr Exp $</id>
    </copyright>

implementation class
UnicodeCharacter: State
{
  <doc> The number of semicolon-separated fields expected on every line
      of the Unicode 2.0 Data file.  The meaning of each field is spelled
      out by the parsing code in the {initWith} instance method.  </doc>
  const NUM_FIELDS = 15;

  <doc> Unique strings for the two-letter general category codes.  They
      are filled in by {load}; the instance predicates ({isDigit},
      {isLetter}, ...) compare a character's {category} against them by
      identity.  </doc>
  static UniqueString cat_Mn, cat_Mc;
  static UniqueString cat_Nd, cat_No;
  static UniqueString cat_Zs, cat_Zl, cat_Zp;
  static UniqueString cat_Cc, cat_Co, cat_Cn;
  static UniqueString cat_Lu, cat_Ll, cat_Lt, cat_Lm, cat_Lo;
  static UniqueString cat_Pd, cat_Ps, cat_Pe, cat_Po;
  static UniqueString cat_Sm, cat_Sc, cat_So;
}

<doc> Intern every general category code this class knows about, so that
    each {cat_XX} variable holds the unique string "XX".  The {arguments}
    are not used.  </doc>
redefine void
  load Array arguments
{
  /* The assignments are independent of one another; they are listed
     alphabetically by category code.  */

  /* `C' categories.  */
  cat_Cc = [UniqueString with "Cc"];
  cat_Cn = [UniqueString with "Cn"];
  cat_Co = [UniqueString with "Co"];

  /* `L' categories, tested by {isLetter}.  */
  cat_Ll = [UniqueString with "Ll"];
  cat_Lm = [UniqueString with "Lm"];
  cat_Lo = [UniqueString with "Lo"];
  cat_Lt = [UniqueString with "Lt"];
  cat_Lu = [UniqueString with "Lu"];

  /* `M' categories.  */
  cat_Mc = [UniqueString with "Mc"];
  cat_Mn = [UniqueString with "Mn"];

  /* `N' categories, tested by {isDigit} and {isNumeric}.  */
  cat_Nd = [UniqueString with "Nd"];
  cat_No = [UniqueString with "No"];

  /* `P' categories, tested by {isPunctuation}.  */
  cat_Pd = [UniqueString with "Pd"];
  cat_Pe = [UniqueString with "Pe"];
  cat_Po = [UniqueString with "Po"];
  cat_Ps = [UniqueString with "Ps"];

  /* `S' categories.  */
  cat_Sc = [UniqueString with "Sc"];
  cat_Sm = [UniqueString with "Sm"];
  cat_So = [UniqueString with "So"];

  /* `Z' categories; {isSpace} tests `Zs'.  */
  cat_Zl = [UniqueString with "Zl"];
  cat_Zp = [UniqueString with "Zp"];
  cat_Zs = [UniqueString with "Zs"];
}

<doc> Allocate a {UnicodeCharacter} and initialize it from {line}, one
    line (without its line terminator) of the Unicode 2.0 Data file.  The
    result is whatever {initWith} returns, which is {nil} for a line it
    rejects.  </doc>
instance (id)
  with String line
{
  = [[self alloc] initWith line];
}

end;

implementation instance
UnicodeCharacter
{
  <doc> The Unicode code value of this character (unique); it is also
      used as the {hash} and as the basis of {equal}.  </doc>
  public char unicode;

  <doc> The (Unicode 2.0) name.  </doc>
  public String name;

  <doc> The general category, stored as a unique string so it can be
      compared by identity with the {cat_XX} class variables.  </doc>
  public String category;

  <doc> The canonical combining class.  </doc>
  public int combining_class;

  <doc> The bidirectional category.  </doc>
  public String direction_category;

  <doc> Character decomposition, kept as the unparsed field text.  </doc>
  public String decomposition;

  <doc> Decimal digit value and digit value, each -1 if the
      corresponding field of the data file is empty (not applicable).
      </doc>
  public int decimal_value, digit_value;

  <doc> Numeric value, kept as the unparsed field text.  </doc>
  public String numeric_value;

  <doc> Whether the character is marked mirrored in bidirectional text.  </doc>
  public boolean mirrored;

  <doc> The old (Unicode 1.0) name.  </doc>
  public String old_name;

  <doc> The 10646 comment field.  </doc>
  public String comment;

  <doc> The upper, lower, and title case equivalents, or 0x0000 if not
      applicable.  </doc>
  public char upper, lower, title;
}

<doc> The hash value of a character is its code value.  </doc>
redefine int
  hash
{
  = unicode;
}

<doc> Two characters are equal if they have the same code value.  The
    {other} object must respond to {unicode}.  </doc>
redefine boolean
  equal id other
{
  = unicode == [other unicode];
}

<doc> Designated initializer.  Initialize from the {line}, which must
    consist of exactly {NUM_FIELDS} fields separated by `;'.

    Return {self}, or {nil} if the line is rejected: the number of fields
    is wrong, a numeric field does not parse, the code value is not
    exactly four hexadecimal digits, a case mapping falls outside
    0x0000..0xFFFF, or the mirrored field is neither "Y" nor "N".  </doc>
id
  initWith String line
{
  MutableArray fields = [line componentsSeparatedBy ';'];
  boolean b;
  int i, v;

  if ([fields length] != NUM_FIELDS)
    return nil;

  /* For every {integerValue} below, {v} is the parsed value and {b} is
     the success indication the code tests.  {i} is taken to be the
     number of characters consumed, which is what the four-digit check on
     the code value relies on.  */

  /* Field 0: the code value, exactly four hexadecimal digits.  */
  (v, b, i) = [fields[0] integerValue (0, -1) defaultBase: 16 allowSign: NO
                         allowCBases: NO baseSeparator: byte (0)];
  if (!b || i != 4 || v < 0 || v >= 0x10000)
    return nil;
  unicode = char (v);

  /* Fields 1 and 2: name and general category.  */
  name = [UniqueString with fields[1]];
  category = [UniqueString with fields[2]];

  /* Field 3: canonical combining class, decimal.  */
  (v, b, i) = [fields[3] integerValue (0, -1) defaultBase: 10 allowSign: NO
                         allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  combining_class = v;

  /* Fields 4 and 5: bidirectional category and decomposition.  */
  direction_category = [UniqueString with fields[4]];
  decomposition = [UniqueString with fields[5]];

  /* Field 6: decimal digit value.  When nothing was consumed the field
     is empty and the documented `not applicable' value -1 is stored
     instead of the parser's result, so that it can not be mistaken for
     the digit zero.  */
  (v, b, i) = [fields[6] integerValue (0, -1) defaultBase: 10
                         allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  if (i == 0)
    decimal_value = -1;
  else
    decimal_value = v;

  /* Field 7: digit value, handled like field 6.  */
  (v, b, i) = [fields[7] integerValue (0, -1) defaultBase: 10
                         allowCBases: NO baseSeparator: byte (0)];
  if (!b)
    return nil;
  if (i == 0)
    digit_value = -1;
  else
    digit_value = v;

  /* Field 8: numeric value, kept as text.  */
  numeric_value = [UniqueString with fields[8]];

  /* Field 9: mirrored, which must be "Y" or "N".  For "N", {mirrored}
     is left untouched.  */
  if (["Y" equal fields[9]])
    mirrored = TRUE;
  else if (!["N" equal fields[9]])
    return nil;

  /* Fields 10 and 11: Unicode 1.0 name and 10646 comment.  */
  old_name = [UniqueString with fields[10]];
  comment = [UniqueString with fields[11]];

  /* Fields 12, 13 and 14: upper, lower and title case equivalents, in
     hexadecimal.  The same range check as for the code value is applied,
     so that a value which does not fit a {char} rejects the line instead
     of being silently truncated.  */
  (v, b, i) = [fields[12] integerValue (0, -1) defaultBase: 16 allowSign: NO
                          allowCBases: NO baseSeparator: byte (0)];
  if (!b || v < 0 || v >= 0x10000)
    return nil;
  upper = char (v);

  (v, b, i) = [fields[13] integerValue (0, -1) defaultBase: 16 allowSign: NO
                          allowCBases: NO baseSeparator: byte (0)];
  if (!b || v < 0 || v >= 0x10000)
    return nil;
  lower = char (v);

  (v, b, i) = [fields[14] integerValue (0, -1) defaultBase: 16 allowSign: NO
                          allowCBases: NO baseSeparator: byte (0)];
  if (!b || v < 0 || v >= 0x10000)
    return nil;
  title = char (v);

  = self;
}

<doc> Return {TRUE} if the character is a decimal digit character
    (category Nd).  </doc>
boolean
  isDigit
{
  = category == cat_Nd;
}

<doc> Return {TRUE} if the character is a letter (category Lu, Ll, Lt,
    Lm, or Lo).  </doc>
boolean
  isLetter
{
  /* We know we can use `==' here because we know the {cat_Lu} and the
     {category} are both unique strings we created.  This is true for all
     strings starting with `cat'.  */
  = category == cat_Lu || category == cat_Ll || category == cat_Lt
    || category == cat_Lm || category == cat_Lo;
}

<doc> Return {TRUE} if the character is in lower case, which it is, if it
    has an upper case equivalent.  </doc>
boolean
  isLower
{
  = upper != char (0);
}

<doc> Return {TRUE} if the character is a numeric character (category Nd
    or No).  </doc>
boolean
  isNumeric
{
  = category == cat_Nd || category == cat_No;
}

<doc> Return {TRUE} if the character is punctuation (category Pd, Ps, Pe,
    or Po).  </doc>
boolean
  isPunctuation
{
  = category == cat_Pd || category == cat_Ps || category == cat_Pe
    || category == cat_Po;
}

<doc> Return {TRUE} if the character is space.  In addition to the
    information in the UnicodeData file (category Zs), the characters 9
    (tab), 10 (linefeed), 11 (vertical tab), 12 (form feed), and 13
    (carriage return) are considered space too.  </doc>
boolean
  isSpace
{
  = category == cat_Zs || (unicode >= char (9) && unicode <= char (13));
}

<doc> Return {TRUE} if the character is in upper case, which it is, if it
    has a lower case equivalent.  </doc>
boolean
  isUpper
{
  = lower != char (0);
}

end;