1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef __CSRMBCS_H
#define __CSRMBCS_H
#include <_foundation_unicode/utypes.h>
#if !UCONFIG_NO_CONVERSION
#include "csrecog.h"
U_NAMESPACE_BEGIN
// "Character" iterated character class.
// Recognizers for specific mbcs encodings make their "characters" available
// by providing a nextChar() function that fills in an instance of IteratedChar
// with the next char from the input.
// The returned characters are not converted to Unicode, but remain as the raw
// bytes (concatenated into an int) from the codepage data.
//
// For Asian charsets, use the raw input rather than the input that has been
// stripped of markup. Detection only considers multi-byte chars, effectively
// stripping markup anyway, and double byte chars do occur in markup too.
//
// Plain record describing one "character" pulled from multi-byte input:
// the raw character value plus position and status fields. All members are
// public; recognizers fill it in through their nextChar() functions.
class IteratedChar : public UMemory
{
public:
uint32_t charValue; // 1-4 bytes from the raw input data
// NOTE(review): presumably the input offset at which the current character
// starts -- confirm against the implementation file.
int32_t index;
// NOTE(review): presumably the input offset of the next byte to be read
// (advanced by nextByte()) -- confirm against the implementation file.
int32_t nextIndex;
// NOTE(review): presumably set when the bytes read do not form a valid
// character in the encoding being tested -- confirm.
UBool error;
// NOTE(review): presumably set once the end of the input has been reached
// -- confirm.
UBool done;
public:
// Constructor; defined out of line, so the initial field values are not
// visible in this header.
IteratedChar();
//void reset();
// Reads from the given InputText and returns an int32_t.
// NOTE(review): presumably returns the next raw input byte and advances
// nextIndex, with a sentinel value at end of input -- confirm against the
// implementation file.
int32_t nextByte(InputText* det);
};
#if APPLE_ICU_CHANGES && U_PLATFORM_IS_DARWIN_BASED
// rdar://10748760 a11f10f892.. When MBCS detectors have few 2-byte chars, presence of key strings increases confidence
// Fixed byte size of one key-string entry: this is the inner array dimension
// of the keyStrings parameter of CharsetRecog_mbcs::match_mbcs().
// NOTE(review): the name suggests the 16 bytes include a terminating NUL
// (i.e. at most 15 content bytes per key string) -- confirm against the
// key-string tables in the implementation file.
#define MAX_KEY_STRING_WITH_NULL 16
#endif // APPLE_ICU_CHANGES && U_PLATFORM_IS_DARWIN_BASED
// Abstract base class for the multi-byte charset recognizers. It declares the
// shared match_mbcs() helper for subclasses and leaves getName(),
// getLanguage(), match() and nextChar() pure virtual.
class CharsetRecog_mbcs : public CharsetRecognizer {
protected:
/**
* Test the match of this charset with the input text data.
*
* @param det The InputText that contains the input text
* to be checked for being in this charset.
* @param commonChars Array of 16-bit character values.
* NOTE(review): presumably the characters that occur
* frequently in this charset -- confirm.
* @param commonCharsLen Number of entries in commonChars (presumably; confirm).
* @param keyStrings (Apple/Darwin builds only) Pointer to entries of
* MAX_KEY_STRING_WITH_NULL bytes each. Per the rdar
* note below, the presence of key strings increases
* confidence when few 2-byte chars are found.
* NOTE(review): how the list is terminated and whether
* it may be null is not visible here -- confirm.
* @return Two values packed into one int
* <br/>
* bits 0-7: the match confidence, ranging from 0-100
* <br/>
* bits 8-15: The match reason, an enum-like value.
* NOTE(review): this return description looks carried over from
* the Java implementation; verify it against the definition.
*/
#if APPLE_ICU_CHANGES && U_PLATFORM_IS_DARWIN_BASED
// rdar://10748760 a11f10f892.. When MBCS detectors have few 2-byte chars, presence of key strings increases confidence
// rdar://11810267&11721802 c5bdd7da51.. After ICU50m2 import, re-add rdar://10748760
int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen, const uint8_t (*keyStrings)[MAX_KEY_STRING_WITH_NULL] ) const;
#else
int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
#endif // APPLE_ICU_CHANGES && U_PLATFORM_IS_DARWIN_BASED
public:
virtual ~CharsetRecog_mbcs();
/**
* Get the IANA name of this charset.
* @return the charset name.
*/
const char *getName() const override = 0;
// Re-declared pure virtual: each concrete recognizer supplies its own
// language string and its own match() implementation.
const char *getLanguage() const override = 0;
UBool match(InputText* input, CharsetMatch *results) const override = 0;
/**
* Get the next character (however many bytes it is) from the input data
* Subclasses for specific charset encodings must implement this function
* to get characters according to the rules of their encoding scheme.
*
* This function is not a method of class IteratedChar only because
* that would require a lot of extra derived classes, which is awkward.
* @param it The IteratedChar "struct" into which the returned char is placed.
* @param textIn The InputText, which is needed to get at the input byte data
* being iterated over.
* @return True if a character was returned, false at end of input.
*/
virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
};
/**
* Shift-JIS charset recognizer.
*
*/
// Derives directly from CharsetRecog_mbcs and declares overrides for all four
// of its pure virtual functions; the definitions live outside this header.
class CharsetRecog_sjis : public CharsetRecog_mbcs {
public:
virtual ~CharsetRecog_sjis();
// Character iteration for this encoding (overrides CharsetRecog_mbcs::nextChar).
UBool nextChar(IteratedChar *it, InputText *det) const override;
// Match entry point taking the input text and a CharsetMatch to fill in.
UBool match(InputText* input, CharsetMatch *results) const override;
// Charset name and language strings for this recognizer.
const char *getName() const override;
const char *getLanguage() const override;
};
/**
* EUC charset recognizers. One abstract class that provides the common function
* for getting the next character according to the EUC encoding scheme,
* and derived classes for EUC-JP and EUC-KR.
*
*/
// Intermediate base for the EUC recognizers. It remains abstract: getName(),
// getLanguage() and match() are re-declared pure virtual, and only nextChar()
// is declared as a non-pure override shared by the derived classes.
class CharsetRecog_euc : public CharsetRecog_mbcs
{
public:
virtual ~CharsetRecog_euc();
// Still pure virtual here; supplied by the encoding-specific subclasses.
const char *getName() const override = 0;
const char *getLanguage() const override = 0;
UBool match(InputText* input, CharsetMatch *results) const override = 0;
/*
* (non-Javadoc)
* Get the next character value for EUC based encodings.
* Character "value" is simply the raw bytes that make up the character
* packed into an int.
*/
UBool nextChar(IteratedChar *it, InputText *det) const override;
};
/**
* The charset recognizer for EUC-JP. A singleton instance of this class
* is created and kept by the public CharsetDetector class.
*/
// EUC-JP recognizer. Declares overrides for the three functions that
// CharsetRecog_euc leaves pure virtual; nextChar() is not redeclared here and
// is therefore inherited from CharsetRecog_euc.
class CharsetRecog_euc_jp : public CharsetRecog_euc
{
public:
virtual ~CharsetRecog_euc_jp();
// Charset name and language strings for this recognizer.
const char *getName() const override;
const char *getLanguage() const override;
// Match entry point taking the input text and a CharsetMatch to fill in.
UBool match(InputText* input, CharsetMatch *results) const override;
};
/**
* The charset recognizer for EUC-KR. A singleton instance of this class
* is created and kept by the public CharsetDetector class.
*/
// EUC-KR recognizer. Declares overrides for the three functions that
// CharsetRecog_euc leaves pure virtual; nextChar() is not redeclared here and
// is therefore inherited from CharsetRecog_euc.
class CharsetRecog_euc_kr : public CharsetRecog_euc
{
public:
virtual ~CharsetRecog_euc_kr();
// Charset name and language strings for this recognizer.
const char *getName() const override;
const char *getLanguage() const override;
// Match entry point taking the input text and a CharsetMatch to fill in.
UBool match(InputText* input, CharsetMatch *results) const override;
};
/**
*
* Big5 charset recognizer.
*
*/
// Derives directly from CharsetRecog_mbcs and declares overrides for all four
// of its pure virtual functions; the definitions live outside this header.
class CharsetRecog_big5 : public CharsetRecog_mbcs
{
public:
virtual ~CharsetRecog_big5();
// Character iteration for this encoding (overrides CharsetRecog_mbcs::nextChar).
UBool nextChar(IteratedChar* it, InputText* det) const override;
// Charset name and language strings for this recognizer.
const char *getName() const override;
const char *getLanguage() const override;
// Match entry point taking the input text and a CharsetMatch to fill in.
UBool match(InputText* input, CharsetMatch *results) const override;
};
/**
*
* GB-18030 recognizer. Uses simplified Chinese statistics.
*
*/
// Derives directly from CharsetRecog_mbcs (not from CharsetRecog_euc) and
// declares overrides for all four of its pure virtual functions; the
// definitions live outside this header.
class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
{
public:
virtual ~CharsetRecog_gb_18030();
// Character iteration for this encoding (overrides CharsetRecog_mbcs::nextChar).
UBool nextChar(IteratedChar* it, InputText* det) const override;
// Charset name and language strings for this recognizer.
const char *getName() const override;
const char *getLanguage() const override;
// Match entry point taking the input text and a CharsetMatch to fill in.
UBool match(InputText* input, CharsetMatch *results) const override;
};
U_NAMESPACE_END
#endif
#endif /* __CSRMBCS_H */
|