1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
|
/*=========================================================================
Program: DICOM for VTK
Copyright (c) 2012-2024 David Gobbi
All rights reserved.
See Copyright.txt or http://dgobbi.github.io/bsd3.txt for details.
This software is distributed WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the above copyright notice for more information.
=========================================================================*/
#ifndef vtkDICOMCharacterSet_h
#define vtkDICOMCharacterSet_h
#include "vtkSystemIncludes.h"
#include "vtkDICOMModule.h" // For export macro
#include "vtkDICOMConfig.h" // For configuration details
#include <string>
//! Character sets.
/*!
* This class provides the means to convert the various international
* text encodings used by DICOM to UTF-8 and back again.
*
* During conversion to UTF-8, any codes from the original encoding that
* can't be converted are replaced by Unicode's "REPLACEMENT CHARACTER",
* which is a question mark in a black diamond. For instance, if the
* original encoding is ISO_IR_6 (ASCII), any octets outside of the
* valid ASCII range of 0 to 127 will become "REPLACEMENT CHARACTER".
*
* DICOM supports a fairly small number of single-byte and multi-byte
* character sets. The only VRs that support these character sets are
* PN, LO, SH, ST, LT, and ST (all other text VRs must be ASCII). In
* addition to ASCII, there are twelve 8-bit single-byte encodings,
* three iso-2022 multi-byte encodings, and three variable-length
* encodings (UTF-8, GB18030, GBK).
*
* In some DICOM data sets, especially old ones, the SpecificCharacterSet
* attribute will be missing and it might be necessary to manually specify
* a character set for the application to use. Use SetGlobalDefault() to
* do so. The vtkDICOMCharacterSet constructor can take the desired
* character encoding as a string, where the following encodings are
* allowed: 'ascii', 'latin1', 'latin2', 'latin3', 'latin4', 'latin5'
* 'latin7', 'latin9', 'cyrillic' (iso-8859-5), 'arabic' (iso-8859-6),
* 'greek' (iso-8859-7), 'hebrew' (iso-8859-8), 'tis-620', 'shift-jis',
* 'euc-jp', 'iso-2022-jp', 'korean' (euc-kr), 'chinese' (gb2312), 'gbk',
* 'gb18030', 'big5', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254',
* 'cp1255', 'cp1256', 'cp1257', 'cp1258', and 'utf-8'. Common aliases
* of these character sets can also be used.
*/
class VTKDICOM_EXPORT vtkDICOMCharacterSet
{
public:
enum EnumType
{
ISO_IR_6 = 0, // US_ASCII
ISO_IR_13 = 1, // JIS X 0201, japanese romaji + katakana
ISO_IR_100 = 8, // ISO-8859-1, latin1, western europe
ISO_IR_101 = 9, // ISO-8859-2, latin2, central europe
ISO_IR_109 = 10, // ISO-8859-3, latin3, maltese
ISO_IR_110 = 11, // ISO-8859-4, latin4, baltic
ISO_IR_144 = 12, // ISO-8859-5, cyrillic
ISO_IR_127 = 13, // ISO-8859-6, arabic
ISO_IR_126 = 14, // ISO-8859-7, greek
ISO_IR_138 = 15, // ISO-8859-8, hebrew
ISO_IR_148 = 16, // ISO-8859-9, latin5, turkish
X_LATIN6 = 17, // ISO-8859-10, latin6, nordic
ISO_IR_166 = 18, // ISO-8859-11, thai
X_LATIN7 = 19, // ISO-8859-13, latin7, baltic rim
X_LATIN8 = 20, // ISO-8859-14, latin8, celtic
ISO_IR_203 = 21, // ISO-8859-15, latin9, western europe
X_LATIN9 = 21, // key from before ISO_IR 203 entered DICOM
X_LATIN10 = 22, // ISO-8859-16, latin10, southeastern europe
X_EUCKR = 24, // euc-kr, korean without escape codes
X_GB2312 = 25, // gb2312, chinese without escape codes
ISO_2022_IR_6 = 32, // US_ASCII
ISO_2022_IR_13 = 33, // JIS X 0201, japanese romaji and katakana
ISO_2022_IR_87 = 34, // JIS X 0208, iso-2022-jp with ascii
ISO_2022_IR_13_87 = 35, // JIS X 0201+0208, iso-2022-jp with romaji
ISO_2022_IR_159 = 36, // JIS X 0212, japanese supplementary
ISO_2022_IR_87_159 = 38, // JIS X 0208+0212, iso-2022-jp-2 subset
ISO_2022_IR_13_87_159 = 39, // JIS X 0201+0208+0212, iso-2022-jp-2 subset
ISO_2022_IR_100 = 40, // ISO-8859-1, latin1, western europe
ISO_2022_IR_101 = 41, // ISO-8859-2, latin2, central europe
ISO_2022_IR_109 = 42, // ISO-8859-3, latin3, maltese
ISO_2022_IR_110 = 43, // ISO-8859-4, latin4, baltic
ISO_2022_IR_144 = 44, // ISO-8859-5, cyrillic
ISO_2022_IR_127 = 45, // ISO-8859-6, arabic
ISO_2022_IR_126 = 46, // ISO-8859-7, greek
ISO_2022_IR_138 = 47, // ISO-8859-8, hebrew
ISO_2022_IR_148 = 48, // ISO-8859-9, latin5, turkish
ISO_2022_IR_166 = 50, // ISO-8859-11, thai
ISO_2022_IR_203 = 53, // ISO-8859-15, latin9, western europe
ISO_2022_IR_149 = 56, // KS X 1001, korean in G1 with escape codes
ISO_2022_IR_58 = 57, // GB2312, chinese in G1 with escape codes
X_ISO_2022_JP = 58, // iso-2022-jp with ascii and romaji
X_ISO_2022_JP_1 = 59, // like above, with addition of JIS X 0212
X_ISO_2022_JP_2 = 60, // adds chinese, korean, latin1, greek
X_ISO_2022_JP_EXT = 61, // iso-2022-jp-1 plus half-width katakana
ISO_IR_192 = 64, // UTF-8, unicode
GB18030 = 65, // gb18030, chinese with full unicode mapping
GBK = 66, // gbk, chinese
X_BIG5 = 67, // big5 + ETEN, traditional chinese
X_EUCJP = 69, // euc-jp, unix encoding for japanese
X_SJIS = 70, // windows-31j, aka shift-jis, code page 932
X_CP874 = 76, // cp1162, thai (windows-874)
X_CP1250 = 80, // cp1250, central europe
X_CP1251 = 81, // cp1251, cyrillic
X_CP1252 = 82, // cp1252, western europe
X_CP1253 = 83, // cp1253, greek
X_CP1254 = 84, // cp1254, turkish
X_CP1255 = 85, // cp1255, hebrew
X_CP1256 = 86, // cp1256, arabic
X_CP1257 = 87, // cp1257, baltic rim
X_CP1258 = 88, // cp1258, vietnamese
X_KOI8 = 90, // koi8, cyrillic
Unknown = 255 // signifies unknown character set
};
//@{
//! Construct an object that describes the default (ASCII) character set.
vtkDICOMCharacterSet() : Key(0) {}
//! Construct a character set object from a given code.
/*!
* The code can be any of the enumerated code values. The ISO_2022 flag
* can be added to any of the ISO-8859 codes to indicate that the character
* set allows the use of escape codes. Also note that ISO_2022_IR_87 and
* ISO_2022_IR_159 are combining codes that can be added to each other and
* to ISO_IR_13. Specifying any other codes in combination can lead to
* undefined results, for example "ISO_2022_IR_100 | ISO_2022_IR_101" is
* not permitted and "ISO_2022_IR_100" must be used instead.
*/
vtkDICOMCharacterSet(int k) : Key(static_cast<unsigned char>(k)) {}
//! Construct a character set object from a SpecificCharacterSet value.
/*!
* This generates an 8-bit code that uniquely identifies a DICOM
* character set plus its code extensions.
*/
explicit vtkDICOMCharacterSet(const std::string& name) {
this->Key = KeyFromString(name.data(), name.length()); }
vtkDICOMCharacterSet(const char *name, size_t nl) {
this->Key = KeyFromString(name, nl); }
//@}
//@{
//! Set the character set to use if SpecificCharacterSet is missing.
/*!
* Some DICOM files do not list a SpecificCharacterSet attribute, but
* nevertheless use a non-ASCII character encoding. This method can be
* used to specify the character set in absence of SpecificCharacterSet.
* If SpecificCharacterSet is present, the default will not override it
* unless OverrideCharacterSet is true.
*/
static void SetGlobalDefault(vtkDICOMCharacterSet cs) {
GlobalDefault = cs.GetKey(); }
static vtkDICOMCharacterSet GetGlobalDefault() {
return GlobalDefault; }
//! Override the value stored in SpecificCharacterSet with the default.
/*!
* This method can be used if the SpecificCharacterSet attribute of a
* file is incorrect. It forces the use of the character set that
* was set with SetGlobalDefault.
*/
static void SetGlobalOverride(bool b) {
GlobalOverride = b; }
static void GlobalOverrideOn() { GlobalOverride = true; }
static void GlobalOverrideOff() { GlobalOverride = false; }
static bool GetGlobalOverride() { return GlobalOverride; }
//@}
//@{
//! Generate SpecificCharacterSet code values (diagnostic only).
/*!
* This will return the same value as GetDefinedTerm() if a defined
* term exists, otherwise it return the same value as GetName() if the
* character set has a name, with a final fallback to the number
* returned by GetKey() converted to a string.
*/
std::string GetCharacterSetString() const;
//! Get the defined term (possible multi-valued) for this character set.
/*!
* If the character set permitted by the DICOM standard, this will return
* the defined term, otherwise the returned value will be NULL. An empty
* string is returned for the default character set (ISO_IR 6). Multiple
* values will be separated by backslashes, e.g. "\\ISO 2022 IR 58" or
* "ISO 2022 IR 13\\ISO 2022 IR 87".
*/
const char *GetDefinedTerm() const;
//! Get the internet MIME name for this character set.
/*!
* The return value will be NULL if there isn't a good match between this
* character set and one of the MIME character sets in common use on the
* internet. So conversion may be necessary, either to UTF-8 or to a
* different encoding with a similar character repertoire. For example,
* "ISO 2022 IR 149" can be converted to "EUC-KR", "ISO 2022 IR 58" can
* can be converted to "GBK", and "ISO 2022 IR 13\\ISO 2022 IR 87" can be
* converted to "Shift_JIS". Note that "ISO-2022-JP" is not equivalent
* to DICOM's Japanese encodings since it does not allow half-width
* katakana or the "ISO 2022 IR 159" characters.
*/
const char *GetMIMEName() const;
//! Get a name that identifies this character set.
/*!
* For DICOM character sets, the name is based on the defined term,
* and for other character sets, the common name is used. If no name
* exists, then "Unknown" will be returned.
*/
const char *GetName() const;
//! Get the numerical code for this character set object.
unsigned char GetKey() const { return this->Key; }
//@}
//@{
//! Convert text from UTF-8 to this encoding.
/*!
* Attempt to convert from UTF-8 to this character set. Every
* non-convertible character will be replaced with '?'. If you pass
* a non-null value for the "lp" parameter, then "lp" will be set to
* the position in the input UTF-8 string where the first conversion
* error occurred, and the unconverted character will be output as
* \<U+XXXX\> instead of '?'. If the conversion was error-free, then
* "lp" will be set to the length of the input string.
*/
std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
std::string FromUTF8(const std::string& text) const {
return FromUTF8(text.data(), text.length()); }
//! Convert text from this encoding to UTF-8.
/*!
* This will convert text to UTF-8, which is generally a lossless
* process for western languages but not for the CJK languages.
* Characters that cannot be mapped to unicode, or whose place in
* unicode is not known, will be printed as unicode U+FFFD which
* appears as a question mark in a diamond. If you pass a non-null
* value for the "lp" parameter, then "lp" will be set the position
* in the input string where the first conversion error occurred, and
* each unconverted byte will be output as \<XX\> (a hexadecimal code
* in angle brackets). If an error-free conversion was returned, then
* "lp" will be set to the length of the input string.
*/
std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
std::string ToUTF8(const std::string& text) const {
return ToUTF8(text.data(), text.length()); }
//! Obsolete method for converting to UTF8.
std::string ConvertToUTF8(const char *text, size_t l) const;
//! Convert text to UTF-8 that is safe to print to the console.
/*!
* All control characters or unconvertible characters will be replaced
* by four-byte octal codes, e.g. '\\033'. Backslashes will be replaced
* by '\\134' to avoid any potential ambiguity.
*/
std::string ToSafeUTF8(const char *text, size_t l) const;
std::string ToSafeUTF8(const std::string& text) const {
return ToSafeUTF8(text.data(), text.length()); }
//! Convert text into a form suitable for case-insensitive matching.
/*!
* This function will perform case normalization on a string by
* converting it to lowercase, and by normalizing the forms of
* lowercase characters that do not have an exact uppercase
* equivalent. In some cases, it might increase the length of
* the string. It covers modern European scripts (including Greek
* and Cyrillic) and latin characters used in East Asian languages.
*/
std::string CaseFoldedUTF8(const char *text, size_t l) const;
std::string CaseFoldedUTF8(const std::string& text) const {
return CaseFoldedUTF8(text.data(), text.length()); }
//! Returns true if ISO 2022 escape codes are used.
/*!
* If this method returns true, then escape codes can be used to
* switch between character sets.
*/
bool IsISO2022() const {
return ((this->Key & ISO_2022_MAX) == (this->Key | ISO_2022_MIN));
}
//! Returns true if this uses an ISO 8859 code page.
bool IsISO8859() const {
return (this->Key >= ISO_IR_100 && this->Key <= X_LATIN10);
}
//! Check for bidirectional character sets.
/*!
* This is used to check for character sets that are likely to
* contain characters that print right-to-left, specifically Hebrew
* and Arabic. Note that even though some parts of unicode fall
* into this category, this flag is off for unicode and GB18030/GBK.
*/
bool IsBiDirectional() const {
return (this->Key == ISO_IR_127 ||
this->Key == ISO_IR_138 ||
this->Key == X_CP1255 ||
this->Key == X_CP1256); }
//@}
//@{
//! Count the number of backslashes in an encoded string.
/*!
* The backslash byte is sometimes present as half of a multibyte
* character in the Japanese and Chinese encodings. This method
* skips these false backslashes and counts only real backslashes.
*/
unsigned int CountBackslashes(const char *text, size_t l) const;
//! Get the offset to the next backslash, or to the end of the string.
/*!
* In order to work properly, this method requires that its input is
* either at the beginning of the string or just after a backslash.
*/
size_t NextBackslash(const char *text, const char *end) const;
//@}
//@{
bool operator==(vtkDICOMCharacterSet b) const { return (this->Key == b.Key); }
bool operator!=(vtkDICOMCharacterSet b) const { return (this->Key != b.Key); }
bool operator<=(vtkDICOMCharacterSet a) const { return (this->Key <= a.Key); }
bool operator>=(vtkDICOMCharacterSet a) const { return (this->Key >= a.Key); }
bool operator<(vtkDICOMCharacterSet a) const { return (this->Key < a.Key); }
bool operator>(vtkDICOMCharacterSet a) const { return (this->Key > a.Key); }
//@}
private:
// ISO-2022 Escape Codes
enum EscapeType {
CODE_ACS, // Announcer Code Sequence
CODE_CZD, // C0 Designate
CODE_C1D, // C1 Designate
CODE_GZD, // G0 Designate
CODE_G1D, // G1 Designate
CODE_G2D, // G2 Designate
CODE_G3D, // G3 Designate
CODE_DOCS, // Designate Other Coding System
CODE_CMD, // Coding Method Delimiter
CODE_IRR, // Identify Revised Registration
CODE_SS2, // Single Shift Two
CODE_SS3, // Single Shift Three
CODE_LS2, // Locking Shift Two
CODE_LS3, // Locking Shift Three
CODE_LS1R, // Locking Shift One Right
CODE_LS2R, // Locking Shift Two Right
CODE_LS3R, // Locking Shift Three Right
CODE_OTHER = 254, // Unrecognized
CODE_ERROR = 255 // Failure indicator
};
// ISO-2022 State Bitfield
enum StateType {
ALTERNATE_CS = 0x00FF,
MULTIBYTE_G0 = 0x0100,
MULTIBYTE_G1 = 0x0200,
MULTIBYTE_G2 = 0x0400,
MULTIBYTE_G3 = 0x0800,
CHARSET96_GX = 0x1000,
CHARSET96_G1 = 0x2000,
CHARSET96_G2 = 0x4000,
CHARSET96_G3 = 0x8000
};
// Other ISO-2022
enum {
DICOM_JP_BITS = 39,
ISO_2022_BASE = 31,
ISO_2022_MIN = 32,
ISO_2022_MAX = 63
};
size_t AnyToUTF8(const char *t, size_t l, std::string *s, int m) const;
size_t UTF8ToSingleByte(const char *t, size_t l, std::string *s, int m) const;
size_t SingleByteToUTF8(const char *t, size_t l, std::string *s, int m) const;
size_t ISO8859ToUTF8(const char *t, size_t l, std::string *s, int m) const;
size_t UTF8ToISO2022(const char *t, size_t l, std::string *s, int m) const;
size_t ISO2022ToUTF8(const char *t, size_t l, std::string *s, int m) const;
size_t UTF8ToEUCKR(const char *t, size_t l, std::string *s, int m) const;
static size_t EUCKRToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToGB2312(const char *t, size_t l, std::string *s, int m);
static size_t GB2312ToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToGB18030(const char *t, size_t l, std::string *s, int m);
static size_t GB18030ToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToGBK(const char *t, size_t l, std::string *s, int m);
static size_t GBKToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToBig5(const char *t, size_t l, std::string *s, int m);
static size_t Big5ToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToEUCJP(const char *t, size_t l, std::string *s, int m);
static size_t EUCJPToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToSJIS(const char *t, size_t l, std::string *s, int m);
static size_t SJISToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToJISX(
int charset, const char *t, size_t l, std::string *s, int m);
static size_t JISXToUTF8(
int csGL, int csGR, const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToCP1258(const char *t, size_t l, std::string *s, int m);
static size_t CP1258ToUTF8(const char *t, size_t l, std::string *s, int m);
static size_t UTF8ToJISX0201(const char *t, size_t l, std::string *s, int m);
unsigned int InitISO2022(unsigned char G[4]) const;
static EscapeType EscapeCode(const char *cp, size_t l, unsigned int *state);
unsigned char CharacterSetFromEscapeCodeJP(const char *code, size_t l) const;
unsigned char CharacterSetFromEscapeCode(const char *code, size_t l) const;
static unsigned char KeyFromString(const char *name, size_t nl);
unsigned char Key;
static unsigned char GlobalDefault;
static bool GlobalOverride;
static const unsigned short *Table[256];
static const unsigned short *Reverse[256];
static const int NumberOfAliases;
static const char *const Aliases[];
static const unsigned char AliasKeys[];
};
VTKDICOM_EXPORT ostream& operator<<(ostream& o, const vtkDICOMCharacterSet& a);
#endif /* vtkDICOMCharacterSet_h */
// VTK-HeaderTest-Exclude: vtkDICOMCharacterSet.h
|