File: vtkDICOMCharacterSet.h

package info (click to toggle)
vtk-dicom 0.8.17-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,176 kB
  • sloc: cpp: 113,811; python: 2,041; makefile: 43; tcl: 10
file content (431 lines) | stat: -rw-r--r-- 19,451 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
/*=========================================================================

  Program: DICOM for VTK

  Copyright (c) 2012-2024 David Gobbi
  All rights reserved.
  See Copyright.txt or http://dgobbi.github.io/bsd3.txt for details.

     This software is distributed WITHOUT ANY WARRANTY; without even
     the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     PURPOSE.  See the above copyright notice for more information.

=========================================================================*/
#ifndef vtkDICOMCharacterSet_h
#define vtkDICOMCharacterSet_h

#include "vtkSystemIncludes.h"
#include "vtkDICOMModule.h" // For export macro
#include "vtkDICOMConfig.h" // For configuration details

#include <string>

//! Character sets.
/*!
 *  This class provides the means to convert the various international
 *  text encodings used by DICOM to UTF-8 and back again.
 *
 *  During conversion to UTF-8, any codes from the original encoding that
 *  can't be converted are replaced by Unicode's "REPLACEMENT CHARACTER",
 *  which is a question mark in a black diamond.  For instance, if the
 *  original encoding is ISO_IR_6 (ASCII), any octets outside of the
 *  valid ASCII range of 0 to 127 will become "REPLACEMENT CHARACTER".
 *
 *  DICOM supports a fairly small number of single-byte and multi-byte
 *  character sets.  The only VRs that support these character sets are
 *  PN, LO, SH, ST, LT, UC, and UT (all other text VRs must be ASCII). In
 *  addition to ASCII, there are twelve 8-bit single-byte encodings,
 *  three iso-2022 multi-byte encodings, and three variable-length
 *  encodings (UTF-8, GB18030, GBK).
 *
 *  In some DICOM data sets, especially old ones, the SpecificCharacterSet
 *  attribute will be missing and it might be necessary to manually specify
 *  a character set for the application to use.  Use SetGlobalDefault() to
 *  do so.  The vtkDICOMCharacterSet constructor can take the desired
 *  character encoding as a string, where the following encodings are
 *  allowed: 'ascii', 'latin1', 'latin2', 'latin3', 'latin4', 'latin5',
 *  'latin7', 'latin9', 'cyrillic' (iso-8859-5), 'arabic' (iso-8859-6),
 *  'greek' (iso-8859-7), 'hebrew' (iso-8859-8), 'tis-620', 'shift-jis',
 *  'euc-jp', 'iso-2022-jp', 'korean' (euc-kr), 'chinese' (gb2312), 'gbk',
 *  'gb18030', 'big5', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254',
 *  'cp1255', 'cp1256', 'cp1257', 'cp1258', and 'utf-8'.  Common aliases
 *  of these character sets can also be used.
 */
class VTKDICOM_EXPORT vtkDICOMCharacterSet
{
public:
  enum EnumType
  {
    ISO_IR_6   = 0,  // US_ASCII
    ISO_IR_13  = 1,  // JIS X 0201,  japanese romaji + katakana
    ISO_IR_100 = 8,  // ISO-8859-1,  latin1, western europe
    ISO_IR_101 = 9,  // ISO-8859-2,  latin2, central europe
    ISO_IR_109 = 10, // ISO-8859-3,  latin3, maltese
    ISO_IR_110 = 11, // ISO-8859-4,  latin4, baltic
    ISO_IR_144 = 12, // ISO-8859-5,  cyrillic
    ISO_IR_127 = 13, // ISO-8859-6,  arabic
    ISO_IR_126 = 14, // ISO-8859-7,  greek
    ISO_IR_138 = 15, // ISO-8859-8,  hebrew
    ISO_IR_148 = 16, // ISO-8859-9,  latin5, turkish
    X_LATIN6   = 17, // ISO-8859-10, latin6, nordic
    ISO_IR_166 = 18, // ISO-8859-11, thai
    X_LATIN7   = 19, // ISO-8859-13, latin7, baltic rim
    X_LATIN8   = 20, // ISO-8859-14, latin8, celtic
    ISO_IR_203 = 21, // ISO-8859-15, latin9, western europe
    X_LATIN9   = 21, // key from before ISO_IR 203 entered DICOM
    X_LATIN10  = 22, // ISO-8859-16, latin10, southeastern europe
    X_EUCKR    = 24, // euc-kr,      korean without escape codes
    X_GB2312   = 25, // gb2312,      chinese without escape codes
    ISO_2022_IR_6   = 32, // US_ASCII
    ISO_2022_IR_13  = 33, // JIS X 0201,  japanese romaji and katakana
    ISO_2022_IR_87  = 34, // JIS X 0208,  iso-2022-jp with ascii
    ISO_2022_IR_13_87 = 35, // JIS X 0201+0208, iso-2022-jp with romaji
    ISO_2022_IR_159 = 36, // JIS X 0212,  japanese supplementary
    ISO_2022_IR_87_159 = 38, // JIS X 0208+0212, iso-2022-jp-2 subset
    ISO_2022_IR_13_87_159 = 39, // JIS X 0201+0208+0212, iso-2022-jp-2 subset
    ISO_2022_IR_100 = 40, // ISO-8859-1,  latin1, western europe
    ISO_2022_IR_101 = 41, // ISO-8859-2,  latin2, central europe
    ISO_2022_IR_109 = 42, // ISO-8859-3,  latin3, maltese
    ISO_2022_IR_110 = 43, // ISO-8859-4,  latin4, baltic
    ISO_2022_IR_144 = 44, // ISO-8859-5,  cyrillic
    ISO_2022_IR_127 = 45, // ISO-8859-6,  arabic
    ISO_2022_IR_126 = 46, // ISO-8859-7,  greek
    ISO_2022_IR_138 = 47, // ISO-8859-8,  hebrew
    ISO_2022_IR_148 = 48, // ISO-8859-9,  latin5, turkish
    ISO_2022_IR_166 = 50, // ISO-8859-11, thai
    ISO_2022_IR_203 = 53, // ISO-8859-15, latin9, western europe
    ISO_2022_IR_149 = 56, // KS X 1001, korean in G1 with escape codes
    ISO_2022_IR_58  = 57, // GB2312, chinese in G1 with escape codes
    X_ISO_2022_JP   = 58, // iso-2022-jp with ascii and romaji
    X_ISO_2022_JP_1 = 59, // like above, with addition of JIS X 0212
    X_ISO_2022_JP_2 = 60, // adds chinese, korean, latin1, greek
    X_ISO_2022_JP_EXT = 61, // iso-2022-jp-1 plus half-width katakana
    ISO_IR_192 = 64, // UTF-8,       unicode
    GB18030    = 65, // gb18030,     chinese with full unicode mapping
    GBK        = 66, // gbk,         chinese
    X_BIG5     = 67, // big5 + ETEN, traditional chinese
    X_EUCJP    = 69, // euc-jp,      unix encoding for japanese
    X_SJIS     = 70, // windows-31j, aka shift-jis, code page 932
    X_CP874    = 76, // cp1162,      thai (windows-874)
    X_CP1250   = 80, // cp1250,      central europe
    X_CP1251   = 81, // cp1251,      cyrillic
    X_CP1252   = 82, // cp1252,      western europe
    X_CP1253   = 83, // cp1253,      greek
    X_CP1254   = 84, // cp1254,      turkish
    X_CP1255   = 85, // cp1255,      hebrew
    X_CP1256   = 86, // cp1256,      arabic
    X_CP1257   = 87, // cp1257,      baltic rim
    X_CP1258   = 88, // cp1258,      vietnamese
    X_KOI8     = 90, // koi8,        cyrillic
    Unknown    = 255  // signifies unknown character set
  };

  //@{
  //! Construct an object that describes the default (ASCII) character set.
  vtkDICOMCharacterSet() : Key(0) {}

  //! Construct a character set object from a given code.
  /*!
   *  The code can be any of the enumerated code values.  The ISO_2022 flag
   *  can be added to any of the ISO-8859 codes to indicate that the character
   *  set allows the use of escape codes.  Also note that ISO_2022_IR_87 and
   *  ISO_2022_IR_159 are combining codes that can be added to each other and
   *  to ISO_IR_13.  Specifying any other codes in combination can lead to
   *  undefined results, for example "ISO_2022_IR_100 | ISO_2022_IR_101" is
   *  not permitted and "ISO_2022_IR_100" must be used instead.
   */
  vtkDICOMCharacterSet(int k) : Key(static_cast<unsigned char>(k)) {}

  //! Construct a character set object from a SpecificCharacterSet value.
  /*!
   *  This generates an 8-bit code that uniquely identifies a DICOM
   *  character set plus its code extensions.
   */
  explicit vtkDICOMCharacterSet(const std::string& name) {
    this->Key = KeyFromString(name.data(), name.length()); }
  vtkDICOMCharacterSet(const char *name, size_t nl) {
    this->Key = KeyFromString(name, nl); }
  //@}

  //@{
  //! Set the character set to use if SpecificCharacterSet is missing.
  /*!
   *  Some DICOM files do not list a SpecificCharacterSet attribute, but
   *  nevertheless use a non-ASCII character encoding.  This method can be
   *  used to specify the character set in absence of SpecificCharacterSet.
   *  If SpecificCharacterSet is present, the default will not override it
   *  unless OverrideCharacterSet is true.
   */
  static void SetGlobalDefault(vtkDICOMCharacterSet cs) {
    GlobalDefault = cs.GetKey(); }
  static vtkDICOMCharacterSet GetGlobalDefault() {
    return GlobalDefault; }

  //! Override the value stored in SpecificCharacterSet with the default.
  /*!
   *  This method can be used if the SpecificCharacterSet attribute of a
   *  file is incorrect.  It forces the use of the character set that
   *  was set with SetGlobalDefault.
   */
  static void SetGlobalOverride(bool b) {
    GlobalOverride = b; }
  static void GlobalOverrideOn() { GlobalOverride = true; }
  static void GlobalOverrideOff() { GlobalOverride = false; }
  static bool GetGlobalOverride() { return GlobalOverride; }
  //@}

  //@{
  //! Generate SpecificCharacterSet code values (diagnostic only).
  /*!
   *  This will return the same value as GetDefinedTerm() if a defined
   *  term exists, otherwise it return the same value as GetName() if the
   *  character set has a name, with a final fallback to the number
   *  returned by GetKey() converted to a string.
   */
  std::string GetCharacterSetString() const;

  //! Get the defined term (possible multi-valued) for this character set.
  /*!
   *  If the character set permitted by the DICOM standard, this will return
   *  the defined term, otherwise the returned value will be NULL.  An empty
   *  string is returned for the default character set (ISO_IR 6).  Multiple
   *  values will be separated by backslashes, e.g. "\\ISO 2022 IR 58" or
   *  "ISO 2022 IR 13\\ISO 2022 IR 87".
   */
  const char *GetDefinedTerm() const;

  //! Get the internet MIME name for this character set.
  /*!
   *  The return value will be NULL if there isn't a good match between this
   *  character set and one of the MIME character sets in common use on the
   *  internet.  So conversion may be necessary, either to UTF-8 or to a
   *  different encoding with a similar character repertoire. For example,
   *  "ISO 2022 IR 149" can be converted to "EUC-KR", "ISO 2022 IR 58" can
   *  can be converted to "GBK", and "ISO 2022 IR 13\\ISO 2022 IR 87" can be
   *  converted to "Shift_JIS".  Note that "ISO-2022-JP" is not equivalent
   *  to DICOM's Japanese encodings since it does not allow half-width
   *  katakana or the "ISO 2022 IR 159" characters.
   */
  const char *GetMIMEName() const;

  //! Get a name that identifies this character set.
  /*!
   *  For DICOM character sets, the name is based on the defined term,
   *  and for other character sets, the common name is used.  If no name
   *  exists, then "Unknown" will be returned.
   */
  const char *GetName() const;

  //! Get the numerical code for this character set object.
  unsigned char GetKey() const { return this->Key; }
  //@}

  //@{
  //! Convert text from UTF-8 to this encoding.
  /*!
   *  Attempt to convert from UTF-8 to this character set.  Every
   *  non-convertible character will be replaced with '?'.  If you pass
   *  a non-null value for the "lp" parameter, then "lp" will be set to
   *  the position in the input UTF-8 string where the first conversion
   *  error occurred, and the unconverted character will be output as
   *  \<U+XXXX\> instead of '?'.  If the conversion was error-free, then
   *  "lp" will be set to the length of the input string.
  */
  std::string FromUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
  std::string FromUTF8(const std::string& text) const {
    return FromUTF8(text.data(), text.length()); }

  //! Convert text from this encoding to UTF-8.
  /*!
   *  This will convert text to UTF-8, which is generally a lossless
   *  process for western languages but not for the CJK languages.
   *  Characters that cannot be mapped to unicode, or whose place in
   *  unicode is not known, will be printed as unicode U+FFFD which
   *  appears as a question mark in a diamond.  If you pass a non-null
   *  value for the "lp" parameter, then "lp" will be set the position
   *  in the input string where the first conversion error occurred, and
   *  each unconverted byte will be output as \<XX\> (a hexadecimal code
   *  in angle brackets).  If an error-free conversion was returned, then
   *  "lp" will be set to the length of the input string.
   */
  std::string ToUTF8(const char *text, size_t l, size_t *lp=nullptr) const;
  std::string ToUTF8(const std::string& text) const {
    return ToUTF8(text.data(), text.length()); }

  //! Obsolete method for converting to UTF8.
  std::string ConvertToUTF8(const char *text, size_t l) const;

  //! Convert text to UTF-8 that is safe to print to the console.
  /*!
   *  All control characters or unconvertible characters will be replaced
   *  by four-byte octal codes, e.g. '\\033'.  Backslashes will be replaced
   *  by '\\134' to avoid any potential ambiguity.
   */
  std::string ToSafeUTF8(const char *text, size_t l) const;
  std::string ToSafeUTF8(const std::string& text) const {
    return ToSafeUTF8(text.data(), text.length()); }

  //! Convert text into a form suitable for case-insensitive matching.
  /*!
   *  This function will perform case normalization on a string by
   *  converting it to lowercase, and by normalizing the forms of
   *  lowercase characters that do not have an exact uppercase
   *  equivalent.  In some cases, it might increase the length of
   *  the string.  It covers modern European scripts (including Greek
   *  and Cyrillic) and latin characters used in East Asian languages.
   */
  std::string CaseFoldedUTF8(const char *text, size_t l) const;
  std::string CaseFoldedUTF8(const std::string& text) const {
    return CaseFoldedUTF8(text.data(), text.length()); }

  //! Returns true if ISO 2022 escape codes are used.
  /*!
   *  If this method returns true, then escape codes can be used to
   *  switch between character sets.
   */
  bool IsISO2022() const {
    return ((this->Key & ISO_2022_MAX) == (this->Key | ISO_2022_MIN));
  }

  //! Returns true if this uses an ISO 8859 code page.
  bool IsISO8859() const {
    return (this->Key >= ISO_IR_100 && this->Key <= X_LATIN10);
  }

  //! Check for bidirectional character sets.
  /*!
   *  This is used to check for character sets that are likely to
   *  contain characters that print right-to-left, specifically Hebrew
   *  and Arabic.  Note that even though some parts of unicode fall
   *  into this category, this flag is off for unicode and GB18030/GBK.
   */
  bool IsBiDirectional() const {
    return (this->Key == ISO_IR_127 ||
            this->Key == ISO_IR_138 ||
            this->Key == X_CP1255 ||
            this->Key == X_CP1256); }
  //@}

  //@{
  //! Count the number of backslashes in an encoded string.
  /*!
   *  The backslash byte is sometimes present as half of a multibyte
   *  character in the Japanese and Chinese encodings.  This method
   *  skips these false backslashes and counts only real backslashes.
   */
  unsigned int CountBackslashes(const char *text, size_t l) const;

  //! Get the offset to the next backslash, or to the end of the string.
  /*!
   *  In order to work properly, this method requires that its input is
   *  either at the beginning of the string or just after a backslash.
   */
  size_t NextBackslash(const char *text, const char *end) const;
  //@}

  //@{
  bool operator==(vtkDICOMCharacterSet b) const { return (this->Key == b.Key); }
  bool operator!=(vtkDICOMCharacterSet b) const { return (this->Key != b.Key); }
  bool operator<=(vtkDICOMCharacterSet a) const { return (this->Key <= a.Key); }
  bool operator>=(vtkDICOMCharacterSet a) const { return (this->Key >= a.Key); }
  bool operator<(vtkDICOMCharacterSet a) const { return (this->Key < a.Key); }
  bool operator>(vtkDICOMCharacterSet a) const { return (this->Key > a.Key); }
  //@}

private:

  // ISO-2022 Escape Codes
  enum EscapeType {
    CODE_ACS,  // Announcer Code Sequence
    CODE_CZD,  // C0 Designate
    CODE_C1D,  // C1 Designate
    CODE_GZD,  // G0 Designate
    CODE_G1D,  // G1 Designate
    CODE_G2D,  // G2 Designate
    CODE_G3D,  // G3 Designate
    CODE_DOCS, // Designate Other Coding System
    CODE_CMD,  // Coding Method Delimiter
    CODE_IRR,  // Identify Revised Registration
    CODE_SS2,  // Single Shift Two
    CODE_SS3,  // Single Shift Three
    CODE_LS2,  // Locking Shift Two
    CODE_LS3,  // Locking Shift Three
    CODE_LS1R, // Locking Shift One Right
    CODE_LS2R, // Locking Shift Two Right
    CODE_LS3R,  // Locking Shift Three Right
    CODE_OTHER = 254, // Unrecognized
    CODE_ERROR = 255 // Failure indicator
  };

  // ISO-2022 State Bitfield
  enum StateType {
    ALTERNATE_CS = 0x00FF,
    MULTIBYTE_G0 = 0x0100,
    MULTIBYTE_G1 = 0x0200,
    MULTIBYTE_G2 = 0x0400,
    MULTIBYTE_G3 = 0x0800,
    CHARSET96_GX = 0x1000,
    CHARSET96_G1 = 0x2000,
    CHARSET96_G2 = 0x4000,
    CHARSET96_G3 = 0x8000
  };

  // Other ISO-2022
  enum {
    DICOM_JP_BITS = 39,
    ISO_2022_BASE = 31,
    ISO_2022_MIN = 32,
    ISO_2022_MAX = 63
  };

  size_t AnyToUTF8(const char *t, size_t l, std::string *s, int m) const;
  size_t UTF8ToSingleByte(const char *t, size_t l, std::string *s, int m) const;
  size_t SingleByteToUTF8(const char *t, size_t l, std::string *s, int m) const;
  size_t ISO8859ToUTF8(const char *t, size_t l, std::string *s, int m) const;
  size_t UTF8ToISO2022(const char *t, size_t l, std::string *s, int m) const;
  size_t ISO2022ToUTF8(const char *t, size_t l, std::string *s, int m) const;
  size_t UTF8ToEUCKR(const char *t, size_t l, std::string *s, int m) const;
  static size_t EUCKRToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToGB2312(const char *t, size_t l, std::string *s, int m);
  static size_t GB2312ToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToGB18030(const char *t, size_t l, std::string *s, int m);
  static size_t GB18030ToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToGBK(const char *t, size_t l, std::string *s, int m);
  static size_t GBKToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToBig5(const char *t, size_t l, std::string *s, int m);
  static size_t Big5ToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToEUCJP(const char *t, size_t l, std::string *s, int m);
  static size_t EUCJPToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToSJIS(const char *t, size_t l, std::string *s, int m);
  static size_t SJISToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToJISX(
    int charset, const char *t, size_t l, std::string *s, int m);
  static size_t JISXToUTF8(
    int csGL, int csGR, const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToCP1258(const char *t, size_t l, std::string *s, int m);
  static size_t CP1258ToUTF8(const char *t, size_t l, std::string *s, int m);
  static size_t UTF8ToJISX0201(const char *t, size_t l, std::string *s, int m);

  unsigned int InitISO2022(unsigned char G[4]) const;
  static EscapeType EscapeCode(const char *cp, size_t l, unsigned int *state);
  unsigned char CharacterSetFromEscapeCodeJP(const char *code, size_t l) const;
  unsigned char CharacterSetFromEscapeCode(const char *code, size_t l) const;
  static unsigned char KeyFromString(const char *name, size_t nl);

  unsigned char Key;

  static unsigned char GlobalDefault;
  static bool GlobalOverride;

  static const unsigned short *Table[256];
  static const unsigned short *Reverse[256];

  static const int NumberOfAliases;
  static const char *const Aliases[];
  static const unsigned char AliasKeys[];
};

//! Write a vtkDICOMCharacterSet to an output stream.
/*!
 *  Only the declaration is visible in this header; the text that is
 *  written for the character set is determined by the implementation file.
 */
VTKDICOM_EXPORT ostream& operator<<(ostream& o, const vtkDICOMCharacterSet& a);

#endif /* vtkDICOMCharacterSet_h */
// VTK-HeaderTest-Exclude: vtkDICOMCharacterSet.h