File: transliterator.cpp

package info (click to toggle)
ruby-charlock-holmes 0.7.9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 116 kB
  • sloc: ansic: 325; ruby: 153; cpp: 101; sh: 21; makefile: 4
file content (130 lines) | stat: -rw-r--r-- 3,304 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include "common.h"
#undef UChar

#include <string>
#include <unicode/translit.h>

extern "C" {

#ifdef HAVE_RUBY_ENCODING_H
#include <ruby/encoding.h>
/* Exception class raised by check_utf8_encoding for unsupported encodings.
 * Assigned in _init_charlock_transliterator from the "CompatibilityError"
 * constant looked up under rb_cEncoding. */
static VALUE rb_eEncodingCompatibilityError;

/*
 * Verifies that `str` carries an encoding this extension accepts.
 *
 * Accepted encodings are the ones returned by rb_utf8_encoding(),
 * rb_usascii_encoding() and rb_ascii8bit_encoding(); they are looked up on
 * the first call and remembered in function-local statics. Any other
 * encoding raises rb_eEncodingCompatibilityError naming the offending
 * encoding.
 */
static void check_utf8_encoding(VALUE str) {
  static rb_encoding *utf8_enc = NULL;
  static rb_encoding *usascii_enc = NULL;
  static rb_encoding *binary_enc = NULL;
  rb_encoding *actual;

  /* utf8_enc doubles as the "already initialised" flag for all three. */
  if (!utf8_enc) {
    utf8_enc = rb_utf8_encoding();
    usascii_enc = rb_usascii_encoding();
    binary_enc = rb_ascii8bit_encoding();
  }

  actual = rb_enc_get(str);
  if (actual == utf8_enc || actual == usascii_enc || actual == binary_enc) {
    return;
  }

  rb_raise(rb_eEncodingCompatibilityError,
    "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(actual));
}

#else
/* Fallback used when HAVE_RUBY_ENCODING_H is not defined: performs no
 * check at all, so every string is accepted. */
static void check_utf8_encoding(VALUE str) {
  (void)str;
}
#endif

/* Parent module object; defined in another translation unit. */
extern VALUE rb_mCharlockHolmes;
/* The "Transliterator" class object, created under rb_mCharlockHolmes by
 * _init_charlock_transliterator. */
static VALUE rb_cTransliterator;

/*
 * Singleton method "id_list" of the Transliterator class.
 *
 * Asks ICU for the enumeration of available transliterator IDs and returns
 * them as a Ruby Array, one string (built by charlock_new_str) per ID.
 *
 * Raises ArgumentError carrying the ICU error name if ICU reports a failure
 * while obtaining, counting or walking the enumeration.
 *
 * rb_raise never returns, so the ICU enumeration is released explicitly
 * before each raise; otherwise it would leak on every error path.
 * NOTE(review): a Ruby exception raised from charlock_new_str or
 * rb_ary_push (e.g. out of memory) would still bypass the final delete.
 */
static VALUE rb_transliterator_id_list(VALUE self) {
  UErrorCode status = U_ZERO_ERROR;
  icu::StringEnumeration *id_list;
  int32_t id_list_size;
  const char *curr_id;
  int32_t curr_id_len;
  VALUE rb_ary;

  id_list = icu::Transliterator::getAvailableIDs(status);
  if (U_FAILURE(status)) {
    delete id_list; /* harmless when ICU handed back NULL */
    rb_raise(rb_eArgError, "%s", u_errorName(status));
  }

  status = U_ZERO_ERROR;
  id_list_size = id_list->count(status);
  if (U_FAILURE(status)) {
    delete id_list;
    rb_raise(rb_eArgError, "%s", u_errorName(status));
  }

  /* Pre-size the array with the count ICU reported. */
  rb_ary = rb_ary_new2(id_list_size);

  for (;;) {
    curr_id_len = 0;
    curr_id = id_list->next(&curr_id_len, status);
    if (U_FAILURE(status)) {
      delete id_list;
      rb_raise(rb_eArgError, "%s", u_errorName(status));
    }

    /* A NULL id marks the end of the enumeration. */
    if (curr_id == NULL) {
      break;
    }

    rb_ary_push(rb_ary, charlock_new_str(curr_id, curr_id_len));
  }

  delete id_list;

  return rb_ary;
}

/*
 * Singleton method "transliterate" of the Transliterator class.
 *
 * rb_txt - the text to transliterate (must be a Ruby String)
 * rb_id  - the ICU transliterator ID to apply (must be a Ruby String)
 *
 * Both arguments are type-checked with Check_Type and then passed through
 * check_utf8_encoding. The text is run through the forward transliterator
 * identified by rb_id and the UTF-8 result is returned as a new string
 * built by charlock_new_str.
 *
 * Raises ArgumentError if either string is longer than ICU's 32-bit length
 * limit, or (carrying the ICU error name) if ICU cannot create the
 * transliterator for rb_id.
 */
static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) {
  /* ICU string lengths are int32_t; anything longer cannot be represented. */
  static const long max_icu_len = 0x7fffffffL;

  UErrorCode status = U_ZERO_ERROR;
  UParseError p_error;
  icu::Transliterator *trans;
  const char *txt;
  long txt_len;
  const char *id;
  long id_len;
  VALUE rb_out;

  Check_Type(rb_txt, T_STRING);
  Check_Type(rb_id, T_STRING);

  check_utf8_encoding(rb_txt);
  check_utf8_encoding(rb_id);

  txt = RSTRING_PTR(rb_txt);
  txt_len = RSTRING_LEN(rb_txt);
  id = RSTRING_PTR(rb_id);
  id_len = RSTRING_LEN(rb_id);

  /* Reject oversized input instead of silently truncating the length. */
  if (txt_len > max_icu_len || id_len > max_icu_len) {
    rb_raise(rb_eArgError, "input is too long to transliterate");
  }

  {
    /* Decode explicitly as UTF-8: the (const char *, int32_t) constructor
     * would use ICU's default codepage converter, which need not be UTF-8.
     * The inner scope destroys the temporary before any rb_raise below. */
    icu::UnicodeString u_id =
      icu::UnicodeString::fromUTF8(icu::StringPiece(id, (int32_t)id_len));
    trans = icu::Transliterator::createInstance(u_id, UTRANS_FORWARD, p_error, status);
  }
  if (U_FAILURE(status)) {
    delete trans; /* harmless when ICU handed back NULL */
    rb_raise(rb_eArgError, "%s", u_errorName(status));
  }

  /* Declared only after the last rb_raise so the non-local exit above never
   * skips a live C++ object's destructor. */
  std::string result;
  {
    icu::UnicodeString u_txt =
      icu::UnicodeString::fromUTF8(icu::StringPiece(txt, (int32_t)txt_len));
    trans->transliterate(u_txt); /* rewrites u_txt in place */

    icu::StringByteSink<std::string> sink(&result);
    u_txt.toUTF8(sink);
  }

  delete trans;

  rb_out = charlock_new_str(result.data(), result.length());

  return rb_out;
}

void _init_charlock_transliterator(void) {
#ifdef HAVE_RUBY_ENCODING_H
  rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
#endif

  rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject);

  rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0);
  rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2);
}

}