/* This header file is in the public domain. */
#ifndef ENCA_H
#define ENCA_H
#include <stdlib.h>
/* According to autoconf stdlib may not be enough for size_t */
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/**
* SECTION:enums
* @short_description: Enca library typedefs, enums and constants.
*
*/
/* Enumerated types */
/*
 * EncaSurface: bit flags (see the flags annotation on the enum), one bit per
 * value plus OR-ed group masks.
 *
 *   bits 0-4   ENCA_SURFACE_EOL_*   collected in ENCA_SURFACE_MASK_EOL
 *   bits 5-7   ENCA_SURFACE_PERM_*  collected in ENCA_SURFACE_MASK_PERM
 *   bit  8     ENCA_SURFACE_QP
 *   bit  13    ENCA_SURFACE_REMOVE
 *   bit  14    ENCA_SURFACE_UNKNOWN
 *
 * ENCA_SURFACE_MASK_ALL combines both group masks with QP and REMOVE; it
 * does not include ENCA_SURFACE_UNKNOWN.
 */
typedef enum { /*< flags >*/
  ENCA_SURFACE_EOL_CR    = 0x0001,
  ENCA_SURFACE_EOL_LF    = 0x0002,
  ENCA_SURFACE_EOL_CRLF  = 0x0004,
  ENCA_SURFACE_EOL_MIX   = 0x0008,
  ENCA_SURFACE_EOL_BIN   = 0x0010,
  ENCA_SURFACE_MASK_EOL  = (ENCA_SURFACE_EOL_CR | ENCA_SURFACE_EOL_LF
                            | ENCA_SURFACE_EOL_CRLF | ENCA_SURFACE_EOL_MIX
                            | ENCA_SURFACE_EOL_BIN),
  ENCA_SURFACE_PERM_21   = 0x0020,
  ENCA_SURFACE_PERM_4321 = 0x0040,
  ENCA_SURFACE_PERM_MIX  = 0x0080,
  ENCA_SURFACE_MASK_PERM = (ENCA_SURFACE_PERM_21 | ENCA_SURFACE_PERM_4321
                            | ENCA_SURFACE_PERM_MIX),
  ENCA_SURFACE_QP        = 0x0100,
  ENCA_SURFACE_REMOVE    = 0x2000,
  ENCA_SURFACE_UNKNOWN   = 0x4000,
  ENCA_SURFACE_MASK_ALL  = (ENCA_SURFACE_MASK_EOL | ENCA_SURFACE_MASK_PERM
                            | ENCA_SURFACE_QP | ENCA_SURFACE_REMOVE)
} EncaSurface;
/*
 * EncaNameStyle: passed as the `whatname` argument of enca_charset_name()
 * and enca_get_surface_name().  Values are consecutive starting at 0; the
 * numbering is spelled out so it stays visible when entries are added.
 */
typedef enum {
  ENCA_NAME_STYLE_ENCA    = 0,
  ENCA_NAME_STYLE_RFC1345 = 1,
  ENCA_NAME_STYLE_CSTOCS  = 2,
  ENCA_NAME_STYLE_ICONV   = 3,
  ENCA_NAME_STYLE_HUMAN   = 4,
  ENCA_NAME_STYLE_MIME    = 5
} EncaNameStyle;
/*
 * EncaCharsetFlags: bit flags occupying bits 0-8.  This is the return type
 * of enca_charset_properties(); the enca_charset_is_*() macros each test one
 * of these bits.
 */
typedef enum { /*< flags >*/
  ENCA_CHARSET_7BIT      = 0x0001,
  ENCA_CHARSET_8BIT      = 0x0002,
  ENCA_CHARSET_16BIT     = 0x0004,
  ENCA_CHARSET_32BIT     = 0x0008,
  ENCA_CHARSET_FIXED     = 0x0010,
  ENCA_CHARSET_VARIABLE  = 0x0020,
  ENCA_CHARSET_BINARY    = 0x0040,
  ENCA_CHARSET_REGULAR   = 0x0080,
  ENCA_CHARSET_MULTIBYTE = 0x0100
} EncaCharsetFlags;
/*
 * EncaErrno: ENCA_EOK is 0 and the remaining codes follow consecutively.
 * NOTE(review): presumably these are the values reported by enca_errno() and
 * accepted by enca_strerror(), both of which are declared with plain int --
 * confirm against the implementation.
 */
typedef enum {
  ENCA_EOK       = 0,
  ENCA_EINVALUE  = 1,
  ENCA_EEMPTY    = 2,
  ENCA_EFILTERED = 3,
  ENCA_ENOCS8    = 4,
  ENCA_ESIGNIF   = 5,
  ENCA_EWINNER   = 6,
  ENCA_EGARBAGE  = 7
} EncaErrno;
#define ENCA_CS_UNKNOWN (-1)
#define ENCA_NOT_A_CHAR 0xffff
/**
* SECTION:analyser
* @short_description: Basic analyser interface.
*
* Basically you want to allocate an analyser with enca_analyser_alloc() for some
* language, use enca_analyse() (or enca_analyse_const()) on a buffer to find its
* encoding, and interpret the results with something like enca_charset_name().
* The analyser then can be used for another buffer. Once you no longer need
* it, call enca_analyser_free() to release it.
* A single working example is better than a hundred pages of reference manual.
*
* A minimal Enca library application – Czech encoding detector.
*
*
* |[
* #include <stdio.h>
* #include <enca.h>
* int main(void)
* {
* EncaAnalyser analyser;
* EncaEncoding encoding;
* unsigned char buffer[4096];
* size_t buflen;
*
* buflen = fread(buffer, 1, 4096, stdin);
* analyser = enca_analyser_alloc("cs");
* encoding = enca_analyse(analyser, buffer, buflen);
* printf("Charset: %s\n", enca_charset_name(encoding.charset,
* ENCA_NAME_STYLE_HUMAN));
* enca_analyser_free(analyser);
* return 0;
* }
* ]|
*/
/* Published (opaque) typedefs */
/* EncaAnalyser is a pointer to struct _EncaAnalyserState.  The struct body is
 * not given in this header, so callers can only use the value as a handle:
 * obtain it from enca_analyser_alloc(), pass it to the functions below, and
 * release it with enca_analyser_free(). */
typedef struct _EncaAnalyserState *EncaAnalyser;
/**
* SECTION:encodings
* @short_description: Functions and macros for getting something sensible from #EncaEncoding.
*
*/
/* Public (transparent) typedefs */
/* EncaEncoding is the by-value result type of enca_analyse(),
 * enca_analyse_const() and enca_parse_encoding_name().  It pairs a charset id
 * (an int; enca_charset_is_known() compares it against ENCA_CS_UNKNOWN) with
 * a set of EncaSurface flags. */
typedef struct _EncaEncoding EncaEncoding;
struct _EncaEncoding { int charset; EncaSurface surface; };
/* Basic interface. */
/* Allocates an analyser for the language named by langname (the usage example
 * in the analyser section passes "cs").  Release the result with
 * enca_analyser_free().
 * NOTE(review): the value returned on failure (presumably NULL) is not
 * visible from this header -- confirm. */
EncaAnalyser enca_analyser_alloc (const char *langname);
/* Releases an analyser obtained from enca_analyser_alloc(). */
void enca_analyser_free (EncaAnalyser analyser);
/* Analyses the first size bytes of buffer and returns the detected encoding.
 * The buffer is taken as non-const.
 * NOTE(review): presumably the buffer may be modified during analysis; use
 * enca_analyse_const() when that is not acceptable -- confirm. */
EncaEncoding enca_analyse (EncaAnalyser analyser,
unsigned char *buffer,
size_t size);
/* Same interface as enca_analyse(), but the buffer is const-qualified. */
EncaEncoding enca_analyse_const (EncaAnalyser analyser,
const unsigned char *buffer,
size_t size);
/* Runs the double-UTF-8 check on the first size bytes of buffer.
 * NOTE(review): the meaning of the int result is not visible here -- confirm
 * against the implementation. */
int enca_double_utf8_check (EncaAnalyser analyser,
const unsigned char *buffer,
size_t size);
/* Returns a pointer to int for the given analyser.
 * NOTE(review): presumably an array of candidate charset ids from the last
 * enca_double_utf8_check(); length and ownership must be confirmed. */
int* enca_double_utf8_get_candidates (EncaAnalyser analyser);
/* Returns the analyser's error code as an int.
 * NOTE(review): presumably one of the EncaErrno values -- confirm. */
int enca_errno (EncaAnalyser analyser);
/* Returns a constant string for error code errnum. */
const char* enca_strerror (EncaAnalyser analyser,
int errnum);
/* Options. */
/*
 * Per-analyser option accessors.  Every option has a setter/getter pair that
 * operates on the given analyser; the getter's return type matches the
 * setter's value parameter.
 *
 * The setters for `significant` and `threshold` return int where the others
 * return void.
 * NOTE(review): that int is presumably a success/failure status -- confirm
 * against the implementation.
 */
void   enca_set_multibyte              (EncaAnalyser analyser,
                                        int multibyte);
int    enca_get_multibyte              (EncaAnalyser analyser);
void   enca_set_interpreted_surfaces   (EncaAnalyser analyser,
                                        int interpreted_surfaces);
int    enca_get_interpreted_surfaces   (EncaAnalyser analyser);
void   enca_set_ambiguity              (EncaAnalyser analyser,
                                        int ambiguity);
int    enca_get_ambiguity              (EncaAnalyser analyser);
void   enca_set_filtering              (EncaAnalyser analyser,
                                        int filtering);
int    enca_get_filtering              (EncaAnalyser analyser);
/* Parameter was misspelled `garabage_test`; prototype parameter names are not
 * part of the interface, so the correction does not affect callers. */
void   enca_set_garbage_test           (EncaAnalyser analyser,
                                        int garbage_test);
int    enca_get_garbage_test           (EncaAnalyser analyser);
void   enca_set_termination_strictness (EncaAnalyser analyser,
                                        int termination_strictness);
int    enca_get_termination_strictness (EncaAnalyser analyser);
int    enca_set_significant            (EncaAnalyser analyser,
                                        size_t significant);
size_t enca_get_significant            (EncaAnalyser analyser);
int    enca_set_threshold              (EncaAnalyser analyser,
                                        double threshold);
double enca_get_threshold              (EncaAnalyser analyser);
/* Names and properties. */
/* Returns a constant string naming charset in the style selected by whatname
 * (the usage example in the analyser section prints it with
 * ENCA_NAME_STYLE_HUMAN). */
const char* enca_charset_name (int charset,
EncaNameStyle whatname);
/* Returns an array of constant strings for charset; n is an out-parameter.
 * NOTE(review): presumably *n receives the number of aliases; ownership of
 * the returned array must be confirmed. */
const char** enca_get_charset_aliases (int charset,
size_t *n);
/* Returns a string naming surface in the style selected by whatname.  Unlike
 * enca_charset_name(), the result is a non-const char*.
 * NOTE(review): presumably newly allocated and owned by the caller --
 * confirm before relying on it. */
char* enca_get_surface_name (EncaSurface surface,
EncaNameStyle whatname);
/* Parses name into an EncaEncoding (charset id plus surface flags). */
EncaEncoding enca_parse_encoding_name (const char *name);
/* Returns EncaSurface flags for charset. */
EncaSurface enca_charset_natural_surface (int charset);
/* Returns the EncaCharsetFlags of charset; the enca_charset_is_*() macros
 * test individual bits of this result. */
EncaCharsetFlags enca_charset_properties (int charset);
/*
 * Charset predicates.
 *
 * enca_charset_is_known() only compares cs against ENCA_CS_UNKNOWN.  Every
 * other macro masks enca_charset_properties(cs) with one EncaCharsetFlags
 * bit, so a "true" result is that bit's value (e.g. 2, 64, 256), not
 * necessarily 1: test it for non-zero rather than comparing with 1.
 * Each macro evaluates its argument exactly once.
 */
#define enca_charset_is_known(cs) \
((cs) != ENCA_CS_UNKNOWN)
#define enca_charset_is_7bit(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_7BIT)
#define enca_charset_is_8bit(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_8BIT)
#define enca_charset_is_16bit(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_16BIT)
#define enca_charset_is_32bit(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_32BIT)
#define enca_charset_is_fixed(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_FIXED)
#define enca_charset_is_variable(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_VARIABLE)
#define enca_charset_is_binary(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_BINARY)
#define enca_charset_is_regular(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_REGULAR)
#define enca_charset_is_multibyte(cs) \
(enca_charset_properties(cs) & ENCA_CHARSET_MULTIBYTE)
/**
* SECTION:auxiliary
* @short_description: Various auxiliary functions and information about libenca.
*
*/
/* Auxiliary functions. */
/* Reports, as an int, whether charset has a UCS-2 map.
 * NOTE(review): presumably non-zero means a map exists -- confirm. */
int enca_charset_has_ucs2_map (int charset);
/* Writes the UCS-2 map of charset into buffer.
 * NOTE(review): the required buffer length (presumably one entry per byte
 * value, i.e. 256) and the meaning of the int result are not visible here --
 * confirm against the implementation. */
int enca_charset_ucs2_map (int charset,
unsigned int *buffer);
/* Returns a charset count as size_t. */
size_t enca_number_of_charsets (void);
/* Returns a constant string identifying the language of analyser. */
const char* enca_analyser_language (EncaAnalyser analyser);
/* Returns a constant string with the English name for lang. */
const char* enca_language_english_name (const char *lang);
/* Returns an array of constant strings; n is an out-parameter.
 * NOTE(review): presumably *n receives the array length; ownership of the
 * returned array must be confirmed. */
const char** enca_get_languages (size_t *n);
/* Returns a pointer to int for the language langname; n is an out-parameter.
 * NOTE(review): presumably an array of charset ids with its length stored in
 * *n; ownership of the returned array must be confirmed. */
int* enca_get_language_charsets (const char *langname,
size_t *n);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif /* ENCA_H */