1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
|
/*
language info: chinese
Copyright (C) 2005 Meng Jie (Zuxy) <zuxy.meng@gmail.com>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif /* HAVE_CONFIG_H */
#include "enca.h"
#include "internal.h"
#include "data/chinese/chinese.h"
static int hook(EncaAnalyserState *analyser);
static int calc_rating(EncaAnalyserState *analyser);
/* Validity check used for the "hz" charset slot.
 * Not 8-bit clean, can't be a HZ here, so every candidate is rejected. */
static int
is_hz(const unsigned char *str)
{
  (void)str;  /* Intentionally unused: the answer does not depend on input. */
  return 0;
}
/* Names of the charsets handled for language "zh".
 * The position of each name is the index used for the same charset in
 * validity_check_table, rate_calc_table and the analyser's ratings array
 * (calc_rating indexes all of them with the same counter). */
static const char *const CHARSET_NAMES[] = {
"gbk",
"big5",
"hz"
};
/* Per-charset validity checks, in the same order as CHARSET_NAMES.
 * calc_rating calls entry j with a pointer to the first byte of a two-byte
 * sequence; a zero result marks charset j as invalid from then on. */
static ValidityFunc* validity_check_table[] = {
is_gbk,
is_big5,
is_hz
};
/* Per-charset rate functions, in the same order as CHARSET_NAMES.
 * calc_rating adds the freq member of the returned weight record (when one
 * is returned) to the charset's rating.  "hz" has no rate function; its NULL
 * slot is not called in practice because is_hz() rejects every input before
 * the rate function would be used. */
static RateFunc* rate_calc_table[] = {
in_gbk,
in_big5,
NULL
};
/* Number of charsets handled for this language: the element count of
 * CHARSET_NAMES (each element is a const char *const). */
#define NCHARSETS (sizeof(CHARSET_NAMES)/sizeof(const char* const))
/**
 * ENCA_LANGUAGE_ZH:
 *
 * Chinese language.
 *
 * Everything the world out there needs to know about this language.
 *
 * The initializer is positional.  The EncaLanguageInfo declaration is not
 * visible in this file, so the per-member notes below describe only the
 * values supplied here; confirm member names and meanings against the
 * struct declaration.
 **/
const EncaLanguageInfo ENCA_LANGUAGE_ZH = {
"zh", /* short language identifier */
"chinese", /* spelled-out language name */
NCHARSETS, /* number of entries in CHARSET_NAMES */
CHARSET_NAMES, /* charset names: gbk, big5, hz */
0, /* this and the next four members are left zero -- */
0, /* NOTE(review): presumably statistics tables that this */
0, /* language does not provide because ratings come from */
0, /* calc_rating instead; confirm against the struct */
0, /* declaration. */
&hook, /* hook(): adjusts negative ratings after calc_rating */
NULL, /* not provided for this language */
NULL, /* not provided for this language */
&calc_rating /* calc_rating(): computes the per-charset ratings */
};
/**
 * hook:
 * @analyser: Analyser state whose charset ratings are to be modified.
 *
 * Post-processes the ratings for language "zh" (see calc_rating below, which
 * may leave negative ratings behind).  Only the two charsets listed first in
 * @analyser->order are examined: when the second one has a negative rating
 * it is reset to 0, and the first one is set to 0 if it was negative as
 * well, or to 1 otherwise so that it wins.
 *
 * Returns: Nonzero if charset ratings have actually been modified, zero
 * otherwise.
 **/
static int
hook(EncaAnalyserState *analyser)
{
  double *const ratings = analyser->ratings;
  const size_t top = analyser->order[0];
  const size_t next = analyser->order[1];

  /* A runner-up that is not negative means there is nothing to adjust. */
  if (!(ratings[next] < 0))
    return 0;

  ratings[next] = 0.;
  /* Make sure that the first won, unless it was flagged invalid too. */
  ratings[top] = (ratings[top] < 0) ? 0. : 1.;
  return 1;
}
/**
 * calc_rating:
 * @analyser: An analyser.
 *
 * Calculates ratings for the charsets of language "zh" (GBK and Big5 carry
 * real ratings).  The buffer is scanned byte by byte; a byte with the high
 * bit set starts a two-byte character whose second byte is the byte that
 * follows.  For every complete two-byte character, each charset that is
 * still considered valid is re-validated: on success the frequency weight
 * of the character (if any) is added to its rating, on failure its rating is
 * set to -1 and the charset is not checked again.
 *
 * Ratings may therefore be negative when an invalid character for a charset
 * was encountered.  This should not affect the result of enca_find_max_sec,
 * but the values must be adjusted to non-negative ones by hook for the
 * final comparison.
 *
 * If the buffer ends in the middle of a two-byte character and
 * termination_strictness is positive, all ratings are reset to zero.
 *
 * Returns: Always 1.
 **/
static int calc_rating(EncaAnalyserState *analyser)
{
  int islowbyte = 0;
  /* size_t so that the index cannot wrap before reaching analyser->size. */
  size_t i, j;
  const size_t size = analyser->size;
  const unsigned char *buffer = analyser->buffer;
  double *ratings = analyser->ratings;
  int continue_check[NCHARSETS];

  assert(analyser->ncharsets == NCHARSETS
         && sizeof(rate_calc_table)/sizeof(RateFunc*) == NCHARSETS
         && sizeof(validity_check_table)/sizeof(ValidityFunc*) == NCHARSETS);

  for (i = 0; i < NCHARSETS; i++) {
    continue_check[i] = 1;
    ratings[i] = 0.;
  }

  for (i = 0; i < size; i++) {
    /* Second byte of a two-byte character: rate the pair. */
    if (islowbyte) {
      const unsigned char *hanzi = buffer + i - 1;
      const struct zh_weight *pweight;

      assert(i);
      for (j = 0; j < NCHARSETS; j++) {
        if (!continue_check[j])
          continue;

        if (!validity_check_table[j](hanzi)) {
          /* First invalid character disqualifies the charset for good. */
          continue_check[j] = 0;
          ratings[j] = -1.;
          continue;
        }

        /* A charset may have no rate function (NULL table entry). */
        if (rate_calc_table[j]) {
          pweight = rate_calc_table[j](hanzi);
          if (pweight)
            ratings[j] += pweight->freq;
        }
      }
      islowbyte = 0;
      continue;
    }

    /* High bit set: this byte leads a two-byte character. */
    if (buffer[i] & 0x80)
      islowbyte = 1;
  }

#ifdef DEBUG
  printf("GBK: %f, BIG5: %f\n", ratings[0], ratings[1]);
#endif

  /* Unfinished DBCS. */
  if (islowbyte && analyser->options.termination_strictness > 0) {
    for (i = 0; i < NCHARSETS; i++)
      ratings[i] = 0.;
  }

  return 1;
}
/* vim: ts=2
*/
|