File: lang.c

package info (click to toggle)
enca 1.13-4
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 4,440 kB
  • sloc: sh: 11,125; ansic: 10,330; xml: 2,926; makefile: 698; perl: 232
file content (351 lines) | stat: -rw-r--r-- 8,651 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/*
  @(#) $Id: lang.c,v 1.18 2005/12/01 10:08:53 yeti Exp $
  uniform interface to particular languages

  Copyright (C) 2000-2003 David Necas (Yeti) <yeti@physics.muni.cz>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include "enca.h"
#include "internal.h"

/**
 * Language `none'.
 *
 * Pseudo-language registered under the code "__".  It declares zero charsets
 * and leaves every data table and hook pointer NULL.
 *
 * This language has no regular charsets, so only multibyte encodings are
 * tested.
 **/
static const EncaLanguageInfo ENCA_LANGUAGE___ = {
  "__", /* name */
  "none", /* human name */
  0,    /* number of charsets */
  NULL, /* their names */
  NULL, /* character weights */
  NULL, /* significancy data */
  NULL, /* letter data */
  NULL, /* pair data */
  0,    /* sum of weights */
  NULL, /* hook function */
  NULL, /* eolhook function */
  NULL, /* lcuchook function */
  NULL, /* ratinghook function */
};

/* All languages.
 *
 * find_language() scans this table linearly by name, and
 * enca_get_languages() reports the names in this very order. */
static const EncaLanguageInfo *const LANGUAGE_LIST[] = {
  &ENCA_LANGUAGE_BE, /* Belarusian. */
  &ENCA_LANGUAGE_BG, /* Bulgarian. */
  &ENCA_LANGUAGE_CS, /* Czech. */
  &ENCA_LANGUAGE_ET, /* Estonian. */
  &ENCA_LANGUAGE_HR, /* Croatian. */
  &ENCA_LANGUAGE_HU, /* Hungarian. */
  &ENCA_LANGUAGE_LT, /* Lithuanian. */
  &ENCA_LANGUAGE_LV, /* Latvian. */
  &ENCA_LANGUAGE_PL, /* Polish. */
  &ENCA_LANGUAGE_RU, /* Russian. */
  &ENCA_LANGUAGE_SK, /* Slovak. */
  &ENCA_LANGUAGE_SL, /* Slovene. */
  &ENCA_LANGUAGE_UK, /* Ukrainian. */
  &ENCA_LANGUAGE_ZH, /* Chinese. */
  &ENCA_LANGUAGE___, /* None. */
};

/* Number of entries in LANGUAGE_LIST, the `none' language included; used as
 * the loop bound wherever the table is walked.  ELEMENTS() is defined outside
 * this file -- presumably the usual array element count. */
#define NLANGUAGES (ELEMENTS(LANGUAGE_LIST))

/* Local prototypes. */
/* Builds the charset id table of a language (NULL when it has none). */
static int* language_charsets_ids(const EncaLanguageInfo *lang);
/* Looks a language up in LANGUAGE_LIST by name (NULL when not found). */
static const EncaLanguageInfo* find_language(const char *langname);

/**
 * enca_language_init:
 * @analyser: Analyser state to be initialized for this language.
 * @langname: Two-letter ISO-639 language code.
 *
 * Initializes the language part of @analyser for language @langname.
 *
 * All language-related fields are reset before the lookup, so when
 * @langname is unknown @analyser ends up with no language and no charsets.
 *
 * Assumes @analyser is uninitialized; calling with an initialized @analyser
 * leads to a memory leak, because the old tables are overwritten, not freed.
 *
 * Returns: Nonzero on success, zero when @langname is not a known language.
 **/
int
enca_language_init(EncaAnalyserState *analyser,
                   const char *langname)
{
  const EncaLanguageInfo *info;

  assert(langname != NULL);

  /* Start from a clean slate; nothing is released here. */
  analyser->charsets = NULL;
  analyser->ncharsets = 0;
  analyser->lcbits = NULL;
  analyser->ucbits = NULL;
  analyser->lang = NULL;

  info = find_language(langname);
  if (info == NULL)
    return 0;

  analyser->lang = info;

  /* A language without charsets keeps the empty charset table. */
  if (info->ncharsets != 0) {
    analyser->ncharsets = info->ncharsets;
    analyser->charsets = language_charsets_ids(info);
  }

  return 1;
}

/**
 * enca_language_destroy:
 * @analyser: Analyser state whose language part should be destroyed.
 *
 * Destroys the language part of analyser state @analyser: the charset id
 * table and both case bit tables are passed to enca_free(), the charset
 * count is zeroed and the language reference is cleared.
 **/
void
enca_language_destroy(EncaAnalyserState *analyser)
{
  /* Detach the language description first... */
  analyser->lang = NULL;
  analyser->ncharsets = 0;

  /* ...then release the tables that were built for it. */
  enca_free(analyser->ucbits);
  enca_free(analyser->lcbits);
  enca_free(analyser->charsets);
}

/**
 * enca_get_languages:
 * @n: The number of languages will be stored here.
 *
 * Returns list of known languages.
 *
 * The returned strings are two-letter ISO-639 language codes, the same as
 * enca_analyser_alloc() accepts.  They appear in the order of the internal
 * language table.
 *
 * The list of languages has to be freed by caller; the strings themselves
 * must be considered constant and must NOT be freed.
 *
 * Returns: The list of languages, storing their number into *@n.
 **/
const char**
enca_get_languages(size_t *n)
{
  const char **names = NEW(const char*, NLANGUAGES);
  size_t idx = 0;

  /* Only the name pointers are copied, never the strings themselves. */
  while (idx < NLANGUAGES) {
    names[idx] = LANGUAGE_LIST[idx]->name;
    idx++;
  }

  *n = idx;
  return names;
}

/**
 * enca_analyser_language:
 * @analyser: An analyser.
 *
 * Returns name of language which @analyser was initialized for.
 *
 * The name is read from the language description attached to @analyser,
 * so @analyser must have a language set.
 *
 * The returned string must be considered constant and must NOT be freed.
 *
 * Returns: The language name.
 **/
const char*
enca_analyser_language(EncaAnalyser analyser)
{
  const EncaLanguageInfo *info;

  assert(analyser != NULL);

  info = analyser->lang;
  return info->name;
}

/**
 * enca_language_english_name:
 * @lang: A two-letter language code, such as obtained from
 *        enca_analyser_language() or enca_get_languages().
 *
 * Returns an English name of a language given its ISO-639 code.
 *
 * The returned string must be considered constant and must NOT be freed.
 *
 * Returns: The English language name; #NULL when @lang is #NULL or is not
 *          a known language code.
 **/
const char*
enca_language_english_name(const char *lang)
{
  const EncaLanguageInfo *const info = find_language(lang);

  return (info != NULL) ? info->humanname : NULL;
}

/**
 * enca_get_language_charsets:
 * @langname: Two-letter ISO-639 language code.
 * @n: The number of charsets will be stored here.
 *
 * Returns list of identifiers of charsets supported for language @langname.
 *
 * The list of charset identifiers has to be freed by caller.
 *
 * Returns: The list of charsets, storing their number into *@n.  When language
 *          contains no charsets or @langname is invalid, #NULL is returned
 *          and zero stored into *@n.
 **/
int*
enca_get_language_charsets(const char *langname,
                           size_t *n)
{
  const EncaLanguageInfo *info;

  assert(langname != NULL);

  info = find_language(langname);
  if (info != NULL) {
    /* A known language; its charset count may still be zero. */
    *n = info->ncharsets;
    return language_charsets_ids(info);
  }

  *n = 0;
  return NULL;
}

/**
 * language_charsets_ids:
 * @lang: A language.
 *
 * Creates and fills table of charset identifiers of charsets supported for
 * language @lang, by converting each charset name of @lang with
 * enca_name_to_charset().
 *
 * The size of the table is determined by @lang->ncharsets.
 *
 * Returns: The newly allocated charsets id table; #NULL when @lang has no
 *          charsets.
 **/
static int*
language_charsets_ids(const EncaLanguageInfo *lang)
{
  size_t count, k;
  int *ids;

  assert(lang != NULL);

  count = lang->ncharsets;
  if (count == 0)
    return NULL;

  ids = NEW(int, count);
  for (k = 0; k < count; k++) {
    const int id = enca_name_to_charset(lang->csnames[k]);

    /* Every charset name listed by a language has to be a known one. */
    assert(id != ENCA_CS_UNKNOWN);
    ids[k] = id;
  }

  return ids;
}

/**
 * find_language:
 * @langname: Language (i.e. locale) name.
 *
 * Finds language @langname.
 *
 * Returns: Pointer to its language information data; #NULL if not found.
 **/
static const EncaLanguageInfo*
find_language(const char *langname)
{
  const EncaLanguageInfo *lang = NULL;
  size_t i;

  if (langname == NULL)
    return NULL;

  for (i = 0; i < NLANGUAGES; i++) {
    if (strcmp(langname, LANGUAGE_LIST[i]->name) == 0) {
      lang = LANGUAGE_LIST[i];
      break;
    }
  }

  return lang;
}

/**
 * enca_get_charset_similarity_matrix:
 * @lang: A language.
 *
 * Computes character weight similarity matrix for the charsets of language
 * @lang.
 *
 * For charsets i and j the raw similarity is the sum over all 0x100 byte
 * values c of weights[i][c]*weights[j][c]/(significant[c] + EPSILON).
 * The matrix is stored row by row, element [i,j] at index i*n + j, where
 * n is @lang->ncharsets.
 *
 * sim[i,j] is normalized to sim[i,i], i.e. each row is divided by its
 * diagonal element, thus:
 * - a row i contains `probabilities' different charsets will look like the
 *   i-th one
 * - a column i contains `probabilities' the i-th charset will look like
 *   the other charsets.
 *
 * For all practical applications, the higher one of sim[i,j] and sim[j,i]
 * is important.
 *
 * Note: this is not used anywhere, only by simtable.
 *
 * Returns: The newly allocated matrix, its size is determined by
 *          @lang->ncharsets; #NULL for language with no charsets.  No
 *          reference to the matrix is kept here, so releasing it is up to
 *          the caller.
 **/
double*
enca_get_charset_similarity_matrix(const EncaLanguageInfo *lang)
{
  const unsigned short int *const *w;
  const unsigned short int *s;
  double *smat;
  size_t n, i, j, c;

  /* Validate @lang before anything is read through it; reading the fields
   * in the declarations would dereference it ahead of this check. */
  assert(lang != NULL);

  n = lang->ncharsets;
  w = lang->weights;
  s = lang->significant;

  if (n == 0)
    return NULL;

  /* Below diagonal (diagonal included): the only part really computed. */
  smat = NEW(double, n*n);
  for (i = 0; i < n; i++) {
    for (j = 0; j <= i; j++) {
      smat[i*n + j] = 0.0;
      for (c = 0; c < 0x100; c++)
        smat[i*n + j] += (double)w[i][c] * (double)w[j][c] / (s[c] + EPSILON);
    }
  }

  /* Above diagonal: the raw matrix is symmetric, so just mirror it. */
  for (i = 0; i < n; i++) {
    for (j = i+1; j < n; j++)
      smat[i*n + j] = smat[j*n + i];
  }

  /* Normalize each row to its diagonal element.  The diagonal element is
   * saved first as it becomes 1 in the course of the division. */
  for (i = 0; i < n; i++) {
    double wmax = smat[i*n + i];

    for (j = 0; j < n; j++) {
      smat[i*n + j] /= wmax;
    }
  }

  return smat;
}
/* vim: ts=2
 */