// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucm.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jun20
* created by: Markus W. Scherer
*
* Definitions for the .ucm file parser and handler module ucm.c.
*/
#ifndef __UCM_H__
#define __UCM_H__
#include "unicode/utypes.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "filestrm.h"
#include <stdio.h>
#if !UCONFIG_NO_CONVERSION
U_CDECL_BEGIN
/*
 * Values for UCMapping.moveFlag.
 * ucm_moveMappings() takes flagged mappings out of the base table; those
 * carrying UCM_MOVE_TO_EXT are moved to the extension table.
 */
enum {
    /* move the mapping from the base table to the extension table */
    UCM_MOVE_TO_EXT = 1,

    /* take the mapping out of the base table */
    UCM_REMOVE_MAPPING = 2
};
/*
 * Per-mapping data structure: one Unicode sequence paired with one byte sequence.
 *
 * u         if uLen==1: the Unicode code point itself
 *           else: index to uLen code points (see UCM_GET_CODE_POINTS)
 * b         if bLen<=4: b.bytes holds the bytes directly
 *           else: b.idx is an index to bLen bytes (see UCM_GET_BYTES)
 * uLen      number of code points
 * bLen      number of bytes
 * f         precision flag, same values as in the source file after |:
 *           roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
 *           or "good one-way" mapping (4)
 * moveFlag  0 or UCM_MOVE_TO_EXT / UCM_REMOVE_MAPPING
 *
 * NOTE(review): older documentation also listed a bIsMultipleChars member and
 * described bLen as a count of words; neither matches the fields declared here.
 */
typedef struct UCMapping {
    UChar32 u;

    union {
        uint32_t idx;
        uint8_t bytes[4];
    } b;

    int8_t uLen;
    int8_t bLen;
    int8_t f;
    int8_t moveFlag;
} UCMapping;
/*
 * Values for UCMTable.flagsType: records whether the mappings parsed so far
 * carried explicit | fallback indicators.
 */
enum {
    /* no mappings parsed yet */
    UCM_FLAGS_INITIAL = 0,

    /* .ucm file has mappings with | fallback indicators */
    UCM_FLAGS_EXPLICIT = 1,

    /* .ucm file has mappings without | fallback indicators, later wins */
    UCM_FLAGS_IMPLICIT = 2,

    /* both implicit and explicit */
    UCM_FLAGS_MIXED = 3
};
/*
 * A table of mappings together with the storage pools that individual
 * UCMapping entries index into.
 */
typedef struct UCMTable {
    /* the mapping entries, with their capacity and length counters */
    UCMapping *mappings;
    int32_t mappingsCapacity;
    int32_t mappingsLength;

    /* code point pool; UCMapping.u indexes into it when uLen!=1 */
    UChar32 *codePoints;
    int32_t codePointsCapacity;
    int32_t codePointsLength;

    /* byte pool; UCMapping.b.idx indexes into it when bLen>4 */
    uint8_t *bytes;
    int32_t bytesCapacity;
    int32_t bytesLength;

    /* index map for mapping by bytes first */
    int32_t *reverseMap;

    uint8_t unicodeMask;

    /* UCM_FLAGS_INITIAL etc. */
    int8_t flagsType;

    UBool isSorted;
} UCMTable;
/*
 * MBCS state flag constants.
 * NOTE(review): presumably the values kept in UCMStates.stateFlags -- confirm
 * against ucmstate.c.
 */
enum {
    MBCS_STATE_FLAG_DIRECT = 1,
    MBCS_STATE_FLAG_SURROGATES = 2,

    MBCS_STATE_FLAG_READY = 16
};
/*
 * State table data for a codepage: up to MBCS_MAX_STATE_COUNT states,
 * each with one 32-bit entry per possible byte value.
 */
typedef struct UCMStates {
    /* one row of 256 entries per state */
    int32_t stateTable[MBCS_MAX_STATE_COUNT][256];

    /* per-state values, indexed like stateTable's first dimension */
    uint32_t stateFlags[MBCS_MAX_STATE_COUNT];
    uint32_t stateOffsetSum[MBCS_MAX_STATE_COUNT];

    int32_t countStates;
    int32_t minCharLength;
    int32_t maxCharLength;
    int32_t countToUCodeUnits;

    int8_t conversionType;
    int8_t outputType;
} UCMStates;
/*
 * In-memory form of a .ucm file: a base table, an extension table,
 * the state table data, and a base name.
 */
typedef struct UCMFile {
    UCMTable *base;
    UCMTable *ext;

    UCMStates states;

    char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
} UCMFile;
/* simple accesses ---------------------------------------------------------- */

/*
 * Pointer to the code point(s) of mapping m, which belongs to table t:
 * the mapping's own u field for a single code point, otherwise the
 * position u inside the table's codePoints pool.
 * Both arguments are pointers; m is evaluated more than once.
 */
#define UCM_GET_CODE_POINTS(t, m) \
    (((m)->uLen != 1) ? ((t)->codePoints + (m)->u) : &((m)->u))

/*
 * Pointer to the bytes of mapping m, which belongs to table t:
 * the mapping's inline b.bytes for up to 4 bytes, otherwise the
 * position b.idx inside the table's bytes pool.
 * Both arguments are pointers; m is evaluated more than once.
 */
#define UCM_GET_BYTES(t, m) \
    (((m)->bLen > 4) ? ((t)->bytes + (m)->b.idx) : (m)->b.bytes)
/* APIs --------------------------------------------------------------------- */
/**
 * Create a UCMFile object.
 * NOTE(review): presumably the result is released with ucm_close(); ownership
 * and out-of-memory behavior are not visible in this header -- confirm in ucm.c.
 *
 * @return pointer to the UCMFile
 */
U_CAPI UCMFile * U_EXPORT2
ucm_open(void);
/**
 * Close a UCMFile object.
 * NOTE(review): presumably the counterpart of ucm_open() that also releases
 * the base and ext tables; NULL handling is not visible here -- confirm in ucm.c.
 *
 * @param ucm the UCMFile to close
 */
U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm);
/**
 * Parse one line from the header section of a .ucm file.
 * NOTE(review): line is not const, so it is presumably modified in place and
 * *pKey / *pValue point into it; the meaning of the return value is not
 * documented here -- confirm both in ucm.c.
 *
 * @param ucm    the file object that the header line belongs to
 * @param line   the header line text
 * @param pKey   output: receives the key
 * @param pValue output: receives the value
 */
U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile *ucm,
char *line, char **pKey, char **pValue);
/**
 * Classify a mapping against the base table's states.
 *
 * @param baseStates state table data of the base table
 * @param m          the mapping to classify
 * @param codePoints the mapping's code points
 * @param bytes      the mapping's bytes
 * @return -1 illegal bytes
 *          0 suitable for base table
 *          1 needs to go into extension table
 */
U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES]);
/**
 * Add a mapping to the base or extension table as appropriate.
 * NOTE(review): the meaning of the return value and the exact role of forBase
 * are not documented here -- confirm in ucm.c.
 *
 * @param ucm        the file object whose base/ext tables receive the mapping
 * @param forBase    see NOTE above
 * @param baseStates state table data of the base table
 * @param m          the mapping to add
 * @param codePoints the mapping's code points
 * @param bytes      the mapping's bytes
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES]);
/**
 * Add the mapping described by one text line.
 * NOTE(review): presumably parses the line like ucm_parseMappingLine() and
 * then adds it like ucm_addMappingAuto(); the return value is not documented
 * here -- confirm in ucm.c.
 *
 * @param ucm        the file object that receives the mapping
 * @param line       the mapping line text
 * @param forBase    same parameter as in ucm_addMappingAuto()
 * @param baseStates state table data of the base table
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
/**
 * Create a UCMTable object.
 * NOTE(review): presumably released with ucm_closeTable(); initial contents
 * and out-of-memory behavior are not visible here -- confirm in ucm.c.
 *
 * @return pointer to the UCMTable
 */
U_CAPI UCMTable * U_EXPORT2
ucm_openTable(void);
/**
 * Close a UCMTable object.
 * NOTE(review): presumably the counterpart of ucm_openTable(); NULL handling
 * is not visible here -- confirm in ucm.c.
 *
 * @param table the table to close
 */
U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable *table);
/**
 * Reset a UCMTable.
 * NOTE(review): presumably empties the table for reuse without closing it;
 * which fields are reset is not visible here -- confirm in ucm.c.
 *
 * @param table the table to reset
 */
U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable *table);
/**
 * Sort a UCMTable.
 * NOTE(review): the sort order and the relation to UCMTable.isSorted and
 * UCMTable.reverseMap are not visible here -- confirm in ucm.c.
 *
 * @param t the table to sort
 */
U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t);
/**
 * Remove mappings with their move flag set from the base table
 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
 *
 * @param base the base table; mappings with a non-zero UCMapping.moveFlag are removed
 * @param ext  the extension table that receives the UCM_MOVE_TO_EXT mappings
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext);
/**
 * Read a table from a .ucm file, starting after the CHARMAP line and
 * continuing up to and including the END CHARMAP line.
 *
 * @param ucm        the file object that receives the mappings
 * @param convFile   the input stream, positioned after the CHARMAP line
 * @param forBase    same parameter as in ucm_addMappingAuto()
 * @param baseStates state table data of the base table
 * @param pErrorCode ICU error code, in/out
 */
U_CAPI void U_EXPORT2
ucm_readTable(UCMFile *ucm, FileStream* convFile,
UBool forBase, UCMStates *baseStates,
UErrorCode *pErrorCode);
/**
 * Check the validity of mappings against a base table's states;
 * necessary for extension-only tables that were read before their base tables.
 * NOTE(review): presumably returns true when all mappings are valid --
 * confirm in ucm.c.
 *
 * @param ext        the extension table whose mappings are checked
 * @param baseStates state table data of the base table
 */
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
/**
 * Check a base table against an extension table.
 *
 * Pass moveTarget!=NULL if it is possible to move mappings out of the base.
 * This is the case when base and extension tables are parsed from a single file
 * (moveTarget==ext)
 * or when delta file mappings are subtracted from a base table.
 *
 * When the base table cannot be modified because a delta file is parsed in makeconv,
 * pass moveTarget=NULL.
 *
 * If intersectBase is set, then mappings that exist in the base table but not in
 * the extension table are moved to moveTarget instead of showing an error.
 *
 * Special mode:
 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
 * not moved out of the base unless their Unicode input requires it.
 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
 * NOTE(review): intersectBase is declared as UBool although the value 2 is
 * meaningful; confirm that UBool can carry it on every supported build.
 *
 * For both tables in the same file, the extension table is automatically
 * built.
 * For separate files, the extension file can use a complete mapping table (.ucm file),
 * so that common mappings need not be stripped out manually.
 *
 *
 * Algorithm: sort both tables, and then for each mapping direction:
 *
 * If intersectBase is true and the base table contains a mapping
 * that does not exist in the extension table, then this mapping is moved
 * to moveTarget.
 *
 * - otherwise -
 *
 * If the base table contains a mapping for which the input sequence is
 * the same as the extension input, then
 * - if the output is the same: remove the extension mapping
 * - else: error
 *
 * If the base table contains a mapping for which the input sequence is
 * a prefix of the extension input, then
 * - if moveTarget!=NULL: move the base mapping to the moveTarget table
 * - else: error
 *
 * @param baseStates    state table data of the base table
 * @param base          the base table
 * @param ext           the extension table
 * @param moveTarget    table that receives mappings moved out of the base, or NULL
 * @param intersectBase see above
 * @return false in case of an irreparable error
 */
U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
UCMTable *moveTarget, UBool intersectBase);
/**
 * Print the mappings of a table to a stdio stream.
 * NOTE(review): byUnicode presumably selects Unicode order versus byte order
 * (cf. UCMTable.reverseMap) -- confirm in ucm.c.
 *
 * @param table     the table to print
 * @param f         the output stream
 * @param byUnicode see NOTE above
 */
U_CAPI void U_EXPORT2
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
/**
 * Print a single mapping to a stdio stream.
 * NOTE(review): the output format is not visible here -- confirm in ucm.c.
 *
 * @param table the table that m belongs to (its pools hold the sequences that
 *              UCM_GET_CODE_POINTS / UCM_GET_BYTES resolve)
 * @param m     the mapping to print
 * @param f     the output stream
 */
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
/**
 * Add a state to the state table data from its textual description.
 * NOTE(review): the syntax of s and the handling of malformed input are not
 * visible here -- confirm in ucmstate.c.
 *
 * @param states the state table data to extend
 * @param s      the state description text
 */
U_CAPI void U_EXPORT2
ucm_addState(UCMStates *states, const char *s);
/**
 * Process the state table data after the states have been added.
 * NOTE(review): what is computed, and which check ignoreSISOCheck disables,
 * are not visible here -- confirm in ucmstate.c.
 *
 * @param states          the state table data
 * @param ignoreSISOCheck see NOTE above
 */
U_CAPI void U_EXPORT2
ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);
/**
 * Count the characters in a byte sequence according to the state table.
 * NOTE(review): the return value for illegal or incomplete sequences is not
 * documented here -- confirm in ucmstate.c.
 *
 * @param states the state table data
 * @param bytes  the byte sequence
 * @param length number of bytes in the sequence
 * @return the character count (see NOTE for error cases)
 */
U_CAPI int32_t U_EXPORT2
ucm_countChars(UCMStates *states,
const uint8_t *bytes, int32_t length);
/**
 * Parse a byte sequence from the text of a mapping line.
 * NOTE(review): presumably *ps is the in/out parse position within line and
 * the return value is the number of bytes parsed; the error return is not
 * visible here -- confirm in ucm.c.
 *
 * @param bytes output buffer with room for UCNV_EXT_MAX_BYTES bytes
 * @param line  the complete line
 * @param ps    pointer to the parse position (see NOTE above)
 */
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
/**
 * Parse one mapping line into a UCMapping plus its code points and bytes.
 * NOTE(review): presumably returns true on success -- confirm in ucm.c.
 *
 * @param m          output: the parsed mapping
 * @param codePoints output buffer with room for UCNV_EXT_MAX_UCHARS code points
 * @param bytes      output buffer with room for UCNV_EXT_MAX_BYTES bytes
 * @param line       the mapping line text
 */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES],
const char *line);
/**
 * Add a mapping to a specific table.
 * NOTE(review): unlike ucm_addMappingAuto(), the target table is chosen by
 * the caller; whether m and the buffers are copied is not visible here --
 * confirm in ucm.c.
 *
 * @param table      the table that receives the mapping
 * @param m          the mapping to add
 * @param codePoints the mapping's code points
 * @param bytes      the mapping's bytes
 */
U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable *table,
UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES]);
/* very makeconv-specific functions ----------------------------------------- */
/**
 * Finalize and optimize states after the toUnicode mappings are processed.
 * NOTE(review): pUnicodeCodeUnits is a pointer to a pointer, so the array is
 * presumably replaced or reallocated by this call -- confirm in ucmstate.c.
 *
 * @param states             the state table data to optimize
 * @param pUnicodeCodeUnits  pointer to the Unicode code units array pointer
 * @param toUFallbacks       array of toUnicode fallbacks
 * @param countToUFallbacks  number of entries in toUFallbacks
 * @param verbose            verbosity switch
 */
U_CAPI void U_EXPORT2
ucm_optimizeStates(UCMStates *states,
uint16_t **pUnicodeCodeUnits,
_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
UBool verbose);
/**
 * Find a toUnicode fallback by offset.
 * Moved here because it is used inside ucmstate.c.
 * NOTE(review): presumably returns the index of the matching entry and a
 * negative value when there is none -- confirm in the implementation.
 *
 * @param toUFallbacks       array of toUnicode fallbacks
 * @param countToUFallbacks  number of entries in toUFallbacks
 * @param offset             the offset to look for
 */
U_CAPI int32_t U_EXPORT2
ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
uint32_t offset);
/* very rptp2ucm-specific functions ----------------------------------------- */
/**
 * Merge separate from-Unicode and to-Unicode tables into one.
 *
 * Input: Separate tables with mappings from/to Unicode,
 * subchar and subchar1 (0 if none).
 * All mappings must have flag 0.
 *
 * Output: fromUTable will contain the union of mappings with the correct
 * precision flags, and be sorted.
 *
 * @param fromUTable    in: mappings from Unicode; out: the merged, sorted table
 * @param toUTable      mappings to Unicode
 * @param subchar       the subchar bytes
 * @param subcharLength number of bytes in subchar
 * @param subchar1      the subchar1 byte, 0 if none
 */
U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
const uint8_t *subchar, int32_t subcharLength,
uint8_t subchar1);
/**
 * Separate the mappings of a UCMFile.
 * NOTE(review): presumably distributes mappings between ucm->base and
 * ucm->ext; the effect of isSISO and the meaning of the return value are not
 * visible here -- confirm in ucm.c.
 *
 * @param ucm    the file object whose mappings are separated
 * @param isSISO see NOTE above
 */
U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile *ucm, UBool isSISO);
U_CDECL_END
#endif /* !UCONFIG_NO_CONVERSION */
#endif /* __UCM_H__ */