File: pair.c

package info (click to toggle)
enca 1.21-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,948 kB
  • sloc: ansic: 10,297; sh: 5,858; xml: 2,132; makefile: 700; perl: 261
file content (237 lines) | stat: -rw-r--r-- 6,545 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/*
  pair-frequency based tests (used for 8bit-dense languages)

  Copyright (C) 2003 David Necas (Yeti) <yeti@physics.muni.cz>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdlib.h>
#include <math.h>

#include "enca.h"
#include "internal.h"

/* Local prototypes. */
static size_t count_all_8bit_pairs(EncaAnalyserState *analyser);
static void compute_pair2bits(EncaAnalyserState *analyser);
static void count_good_pairs(EncaAnalyserState *analyser);

/**
 * enca_pair_init:
 * @analyser: Analyzer state to be initialized.
 *
 * Initializes pair statistics data.
 *
 * In fact it just sets everything to #NULL, to be initialized when needed.
 **/
void
enca_pair_init(EncaAnalyserState *analyser)
{
  analyser->bitcounts = NULL;
  analyser->pairratings = NULL;
  analyser->pair2bits = NULL;
}

/**
 * enca_pair_destroy:
 * @analyser: Analyzer state whose pair statistics part should be destroyed.
 *
 * Destroys the pair statistics part of analyser state @analyser.
 **/
void
enca_pair_destroy(EncaAnalyserState *analyser)
{
  enca_free(analyser->pair2bits);
  enca_free(analyser->bitcounts);
  enca_free(analyser->pairratings);
}

/**
 * enca_pair_analyse:
 * @analyser: Analysed containing the sample for pair frequency analysis.
 *
 * Performs pair-frequency based analysis, provided that the language supports
 * it (does nothing otherwise).
 *
 * Returns: Nonzero when the character set was succesfully determined,
 *          @analyser->@result.@charset is then directly modified.
 **/
int
enca_pair_analyse(EncaAnalyserState *analyser)
{
  const unsigned char *const *letters = analyser->lang->letters;
  const unsigned char **const *pairs = analyser->lang->pairs;
  size_t ncharsets = analyser->ncharsets;
  size_t i, best, all8bitpairs;
  double q;

  if (!letters || !pairs)
    return 0;

  if (!analyser->pairratings)
    analyser->pairratings = NEW(size_t, ncharsets);

  /* count the good pairs and find winner
   * initialize when we are called the first time */
  if (!analyser->pair2bits) {
    compute_pair2bits(analyser);
    analyser->bitcounts = NEW(size_t, 1 << ncharsets);
  }
  memset(analyser->pairratings, 0, ncharsets*sizeof(size_t));
  all8bitpairs = count_all_8bit_pairs(analyser);
  count_good_pairs(analyser);

  best = 0;
  for (i = 1; i < ncharsets; i++) {
    if (analyser->pairratings[i] > analyser->pairratings[best])
      best = i;
  }

  /* Just a Right Value */
  q = 1.0 - exp(3.0*(1.0 - analyser->options.threshold));
  if (analyser->pairratings[best] >= analyser->options.min_chars
      && analyser->pairratings[best] >= q*all8bitpairs) {
    analyser->result.charset = analyser->charsets[best];
    return 1;
  }

  /* I don't like saying it, but the sample seems to be garbage... */
  return 0;
}

/**
 * count_all_8bit_pairs:
 * @analyser: An analyser.
 *
 * Count all pairs containing at least one 8bit characters.
 *
 * Returns: The number of such pairs.
 **/
static size_t
count_all_8bit_pairs(EncaAnalyserState *analyser)
{
  unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;
  size_t i, c, sum8bits;

  sum8bits = 0;
  c = FILL_NONLETTER;
  for (i = size; i; i--) {
    if ((c | *buffer) & 0x80)
      sum8bits++;
    c = *(buffer++);
  }
  if (size && (c & 0x80))
    sum8bits++;

  return sum8bits;
}

/**
 * count_good_pairs:
 * @analyser: An analyser.
 *
 * Count `good' pairs for each charset.
 *
 * Makes use of @analyser->pair2bits.  See compute_pair2bits() comment for
 * description of how it works.
 **/
static void
count_good_pairs(EncaAnalyserState *analyser)
{
  size_t *ratings = analyser->pairratings;
  unsigned char *pair2bits = analyser->pair2bits;
  size_t *bitcounts = analyser->bitcounts;
  size_t ncharsets = analyser->ncharsets;
  const unsigned char *buffer = analyser->buffer;
  size_t size = analyser->size;
  size_t i, j, c, cs;

  assert(ncharsets <= 8);
  assert(pair2bits);
  assert(bitcounts);
  assert(ratings);

  memset(bitcounts, 0, (1 << ncharsets)*sizeof(size_t));
  c = FILL_NONLETTER << 8;
  for (i = size; i; i--) {
    bitcounts[pair2bits[c | *buffer]]++;
    c = *(buffer++) << 8;
  }
  if (size)
    bitcounts[pair2bits[c | FILL_NONLETTER]]++;

  memset(ratings, 0, ncharsets*sizeof(size_t));
  for (cs = 0; cs < ncharsets; cs++) {
    size_t bit = 1 << cs;
    size_t rating = 0;

    for (i = 0; i < (1U << ncharsets); i += 2*bit) {
      for (j = i+bit; j < i+2*bit; j++)
        rating += bitcounts[j];
    }
    ratings[cs] = rating;
  }
}

/**
 * compute_pair2bits:
 * @analyser: An analyser.
 *
 * Allocate and fill @analyser->@pair2bits.
 *
 * The meaning of pair2bits is following: it's indexed by pairs (0x100*first
 * + second) and each @pair2bits element has set a bit corresponding to a
 * charset when the pair is `good' for the charset.
 *
 * To determine `good' pair counts for all charsets we don't have to count
 * the pairs, we only have to count the bit combinations (@bitcounts) and the
 * per charset pair counts can be easily constructed from them -- by summing
 * those with the particular bit set.
 **/
static void
compute_pair2bits(EncaAnalyserState *analyser)
{
  size_t ncharsets = analyser->ncharsets;
  size_t cs, c;

  assert(analyser->pair2bits == NULL);
  assert(analyser->ncharsets <= 8);

  analyser->pair2bits = NEW(unsigned char, 0x10000);
  memset(analyser->pair2bits, 0, 0x10000);
  for (cs = 0; cs < ncharsets; cs++) {
    const unsigned char *letters = analyser->lang->letters[cs];
    const unsigned char *const *pairs = analyser->lang->pairs[cs];
    size_t bit = 1 << cs;

    for (c = 0; c < 0x100; c++) {
      size_t j = letters[c];

      if (j != 255) {
        const unsigned char *s = pairs[j];
        unsigned char *p = analyser->pair2bits + (c << 8);

        /* set the corresponding bit for all pairs */
        do {
          p[(size_t)*(s++)] |= bit;
        } while (*s);
      }
    }
  }
}