File: enca.h

package info (click to toggle)
enca 1.21-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 4,948 kB
  • sloc: ansic: 10,297; sh: 5,858; xml: 2,132; makefile: 700; perl: 261
file content (223 lines) | stat: -rw-r--r-- 8,442 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* This header file is in the public domain. */
#ifndef ENCA_H
#define ENCA_H

#include <stdlib.h>
/* According to autoconf stdlib may not be enough for size_t */
#include <sys/types.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

/**
 * SECTION:enums
 * @short_description: Enca library typedefs, enums and constants.
 *
 */
/* Enumerated types */

typedef enum { /*< flags >*/
  ENCA_SURFACE_EOL_CR    = 1 << 0,
  ENCA_SURFACE_EOL_LF    = 1 << 1,
  ENCA_SURFACE_EOL_CRLF  = 1 << 2,
  ENCA_SURFACE_EOL_MIX   = 1 << 3,
  ENCA_SURFACE_EOL_BIN   = 1 << 4,
  ENCA_SURFACE_MASK_EOL  = (ENCA_SURFACE_EOL_CR
                            | ENCA_SURFACE_EOL_LF
                            | ENCA_SURFACE_EOL_CRLF
                            | ENCA_SURFACE_EOL_MIX
                            | ENCA_SURFACE_EOL_BIN),
  ENCA_SURFACE_PERM_21    = 1 << 5,
  ENCA_SURFACE_PERM_4321  = 1 << 6,
  ENCA_SURFACE_PERM_MIX   = 1 << 7,
  ENCA_SURFACE_MASK_PERM  = (ENCA_SURFACE_PERM_21
                             | ENCA_SURFACE_PERM_4321
                             | ENCA_SURFACE_PERM_MIX),
  ENCA_SURFACE_QP        = 1 << 8,
  ENCA_SURFACE_REMOVE    = 1 << 13,
  ENCA_SURFACE_UNKNOWN   = 1 << 14,
  ENCA_SURFACE_MASK_ALL  = (ENCA_SURFACE_MASK_EOL
                            | ENCA_SURFACE_MASK_PERM
                            | ENCA_SURFACE_QP
                            | ENCA_SURFACE_REMOVE)
} EncaSurface;

typedef enum {
  ENCA_NAME_STYLE_ENCA,
  ENCA_NAME_STYLE_RFC1345,
  ENCA_NAME_STYLE_CSTOCS,
  ENCA_NAME_STYLE_ICONV,
  ENCA_NAME_STYLE_HUMAN,
  ENCA_NAME_STYLE_MIME
} EncaNameStyle;

typedef enum { /*< flags >*/
  ENCA_CHARSET_7BIT      = 1 << 0,
  ENCA_CHARSET_8BIT      = 1 << 1,
  ENCA_CHARSET_16BIT     = 1 << 2,
  ENCA_CHARSET_32BIT     = 1 << 3,
  ENCA_CHARSET_FIXED     = 1 << 4,
  ENCA_CHARSET_VARIABLE  = 1 << 5,
  ENCA_CHARSET_BINARY    = 1 << 6,
  ENCA_CHARSET_REGULAR   = 1 << 7,
  ENCA_CHARSET_MULTIBYTE = 1 << 8
} EncaCharsetFlags;

typedef enum {
  ENCA_EOK = 0,
  ENCA_EINVALUE,
  ENCA_EEMPTY,
  ENCA_EFILTERED,
  ENCA_ENOCS8,
  ENCA_ESIGNIF,
  ENCA_EWINNER,
  ENCA_EGARBAGE
} EncaErrno;

#define ENCA_CS_UNKNOWN (-1)

#define ENCA_NOT_A_CHAR 0xffff

/**
 * SECTION:analyser
 * @short_description: Basic analyser interface.
 *
 * Basically you want to allocate an analyser with enca_analyser_alloc() for some
 * language, use enca_analyse() (or enca_analyse_const()) on a buffer to find its
 * encoding, and interpret the results with something like enca_charset_name().
 * The analyser then can be used for another buffer. Once you no longer need
 * it, call enca_analyser_free() to release it.
 * A single working example is better than a hundred pages of reference manual.
 *
 * A minimal Enca library application &ndash; Czech encoding detector.
 *
 *
 * |[
 * #include <stdio.h>
 * #include <enca.h>
 * int main(void)
 * {
 * EncaAnalyser analyser;
 * EncaEncoding encoding;
 * unsigned char buffer[4096];
 * size_t buflen;
 *
 * buflen = fread(buffer, 1, 4096, stdin);
 * analyser = enca_analyser_alloc("cs");
 * encoding = enca_analyse(analyser, buffer, buflen);
 * printf("Charset: %s\n", enca_charset_name(encoding.charset,
                                             ENCA_NAME_STYLE_HUMAN));
 * enca_analyser_free(analyser);

 * return 0;
 * }
 * ]|
 */
/* Published (opaque) typedefs  */
typedef struct _EncaAnalyserState *EncaAnalyser;
/**
 * SECTION:encodings
 * @short_description: Functions and macros for getting something sensible from #EncaEncoding.
 *
 */
/* Public (transparent) typedefs */
typedef struct _EncaEncoding EncaEncoding;

struct _EncaEncoding { int charset; EncaSurface surface; };

/* Basic interface. */
EncaAnalyser  enca_analyser_alloc (const char *langname);
void          enca_analyser_free  (EncaAnalyser analyser);
EncaEncoding  enca_analyse        (EncaAnalyser analyser,
                                   unsigned char *buffer,
                                   size_t size);
EncaEncoding  enca_analyse_const  (EncaAnalyser analyser,
                                   const unsigned char *buffer,
                                   size_t size);
int           enca_double_utf8_check (EncaAnalyser analyser,
                                      const unsigned char *buffer,
                                      size_t size);
int*          enca_double_utf8_get_candidates (EncaAnalyser analyser);
int           enca_errno          (EncaAnalyser analyser);
const char*   enca_strerror       (EncaAnalyser analyser,
                                   int errnum);

/* Options. */
void          enca_set_multibyte              (EncaAnalyser analyser,
                                               int multibyte);
int           enca_get_multibyte              (EncaAnalyser analyser);
void          enca_set_interpreted_surfaces   (EncaAnalyser analyser,
                                               int interpreted_surfaces);
int           enca_get_interpreted_surfaces   (EncaAnalyser analyser);
void          enca_set_ambiguity              (EncaAnalyser analyser,
                                               int ambiguity);
int           enca_get_ambiguity              (EncaAnalyser analyser);
void          enca_set_filtering              (EncaAnalyser analyser,
                                               int filtering);
int           enca_get_filtering              (EncaAnalyser analyser);
void          enca_set_garbage_test           (EncaAnalyser analyser,
                                               int garabage_test);
int           enca_get_garbage_test           (EncaAnalyser analyser);
void          enca_set_termination_strictness (EncaAnalyser analyser,
                                               int termination_strictness);
int           enca_get_termination_strictness (EncaAnalyser analyser);
int           enca_set_significant            (EncaAnalyser analyser,
                                               size_t significant);
size_t        enca_get_significant            (EncaAnalyser analyser);
int           enca_set_threshold              (EncaAnalyser analyser,
                                               double threshold);
double        enca_get_threshold              (EncaAnalyser analyser);

/* Names and properties. */
const char*       enca_charset_name            (int charset,
                                                EncaNameStyle whatname);
const char**      enca_get_charset_aliases     (int charset,
                                                size_t *n);
char*             enca_get_surface_name        (EncaSurface surface,
                                                EncaNameStyle whatname);
EncaEncoding      enca_parse_encoding_name     (const char *name);
EncaSurface       enca_charset_natural_surface (int charset);
EncaCharsetFlags  enca_charset_properties      (int charset);

#define enca_charset_is_known(cs) \
  ((cs) != ENCA_CS_UNKNOWN)
#define enca_charset_is_7bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_7BIT)
#define enca_charset_is_8bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_8BIT)
#define enca_charset_is_16bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_16BIT)
#define enca_charset_is_32bit(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_32BIT)
#define enca_charset_is_fixed(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_FIXED)
#define enca_charset_is_variable(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_VARIABLE)
#define enca_charset_is_binary(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_BINARY)
#define enca_charset_is_regular(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_REGULAR)
#define enca_charset_is_multibyte(cs) \
  (enca_charset_properties(cs) & ENCA_CHARSET_MULTIBYTE)

/**
 * SECTION:auxiliary
 * @short_description: Variouis auxiliary functions and informations about libenca.
 *
 */
/* Auxuliary functions. */
int           enca_charset_has_ucs2_map  (int charset);
int           enca_charset_ucs2_map      (int charset,
                                          unsigned int *buffer);
size_t        enca_number_of_charsets    (void);
const char*   enca_analyser_language     (EncaAnalyser analyser);
const char*   enca_language_english_name (const char *lang);
const char**  enca_get_languages         (size_t *n);
int*          enca_get_language_charsets (const char *langname,
                                          size_t *n);
#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif