File: Encoding.cpp

package info (click to toggle)
sight 21.1.1-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 36,592 kB
  • sloc: cpp: 228,341; xml: 19,066; ansic: 9,854; python: 302; sh: 135; makefile: 32
file content (333 lines) | stat: -rw-r--r-- 10,658 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/************************************************************************
 *
 * Copyright (C) 2009-2022 IRCAD France
 * Copyright (C) 2012-2019 IHU Strasbourg
 *
 * This file is part of Sight.
 *
 * Sight is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Sight is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Sight. If not, see <https://www.gnu.org/licenses/>.
 *
 ***********************************************************************/

#include "io/dicom/helper/Encoding.hpp"

#include <core/log/Logger.hpp>
#include <core/spyLog.hpp>

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/locale/encoding.hpp>

namespace sight::io::dicom
{

namespace helper
{

// Lookup table from a DICOM Specific Character Set (0008,0005) defined term to the
// charset name handed to boost::locale::conv::to_utf by the conversion functions of this file.
// An empty charset name means "no conversion": the text is returned as-is (treated as ASCII).
const Encoding::DefinedTermToCharsetMapType Encoding::s_DEFINED_TERM_TO_CHARSET = {
    // ASCII (empty charset name: no conversion is performed)
    {"ISO_IR 6", ""},
    {"ISO 2022 IR 6", ""},

    // Latin alphabet No. 1
    {"ISO_IR 100", "ISO-8859-1"},
    {"ISO 2022 IR 100", "ISO-8859-1"},

    // Latin alphabet No. 2
    {"ISO_IR 101", "ISO-8859-2"},
    {"ISO 2022 IR 101", "ISO-8859-2"},

    // Latin alphabet No. 3
    {"ISO_IR 109", "ISO-8859-3"},
    {"ISO 2022 IR 109", "ISO-8859-3"},

    // Latin alphabet No. 4
    {"ISO_IR 110", "ISO-8859-4"},
    {"ISO 2022 IR 110", "ISO-8859-4"},

    // Cyrillic
    {"ISO_IR 144", "ISO-8859-5"},
    {"ISO 2022 IR 144", "ISO-8859-5"},

    // Arabic
    {"ISO_IR 127", "ISO-8859-6"},
    {"ISO 2022 IR 127", "ISO-8859-6"},

    // Greek
    {"ISO_IR 126", "ISO-8859-7"},
    {"ISO 2022 IR 126", "ISO-8859-7"},

    // Hebrew
    {"ISO_IR 138", "ISO-8859-8"},
    {"ISO 2022 IR 138", "ISO-8859-8"},

    // Latin alphabet No. 5
    {"ISO_IR 148", "ISO-8859-9"},
    {"ISO 2022 IR 148", "ISO-8859-9"},

    // Japanese (may require "ISO-IR-13")
    {"ISO_IR 13", "JIS_X0201"},
    {"ISO 2022 IR 13", "JIS_X0201"},

    // Thai
    {"ISO_IR 166", "ISO-IR-166"},
    {"ISO 2022 IR 166", "ISO-IR-166"},

    // Japanese (multi-byte)
    {"ISO 2022 IR 87", "ISO-IR-87"},
    {"ISO 2022 IR 159", "ISO-IR-159"},

    // Korean
    // - is this mapping really correct?
    {"ISO 2022 IR 149", "EUC-KR"},

    // Simplified Chinese
    // - is this mapping really correct?
    {"ISO 2022 IR 58", "GB2312"},

    // Unicode in UTF-8 (multi-byte)
    {"ISO_IR 192", "UTF-8"},

    // Chinese (multi-byte)
    {"GB18030", "GB18030"},

    // Simplified Chinese (multi-byte)
    {"GBK", "GBK"},
};

// Lookup table for two-byte escape sequences: the key holds the two bytes that follow the ESC
// character, the value is the pair (DICOM defined term, charset name used for the conversion).
// An empty charset name means the text is kept as-is (ASCII).
// Three-byte escape sequences are not listed here; they are resolved directly in
// convertSequenceWithCodeExtensions().
const Encoding::EscapeSequenceToCharsetMapType Encoding::s_ESCAPE_SEQUENCE_TO_CHARSET = {
    {{0x28, 0x42}, {"ISO 2022 IR 6", ""}},             // ASCII
    {{0x2d, 0x41}, {"ISO 2022 IR 100", "ISO-8859-1"}}, // Latin alphabet No. 1
    {{0x2d, 0x42}, {"ISO 2022 IR 101", "ISO-8859-2"}}, // Latin alphabet No. 2
    {{0x2d, 0x43}, {"ISO 2022 IR 109", "ISO-8859-3"}}, // Latin alphabet No. 3
    {{0x2d, 0x44}, {"ISO 2022 IR 110", "ISO-8859-4"}}, // Latin alphabet No. 4
    {{0x2d, 0x4c}, {"ISO 2022 IR 144", "ISO-8859-5"}}, // Cyrillic
    {{0x2d, 0x47}, {"ISO 2022 IR 127", "ISO-8859-6"}}, // Arabic
    {{0x2d, 0x46}, {"ISO 2022 IR 126", "ISO-8859-7"}}, // Greek
    {{0x2d, 0x48}, {"ISO 2022 IR 138", "ISO-8859-8"}}, // Hebrew
    {{0x2d, 0x4d}, {"ISO 2022 IR 148", "ISO-8859-9"}}, // Latin alphabet No. 5
    {{0x29, 0x49}, {"ISO 2022 IR 13", "JIS_X0201"}},   // Japanese (may require "ISO-IR-13")
    {{0x28, 0x4a}, {"ISO 2022 IR 13", "ISO-IR-14"}},   // Japanese
    {{0x2d, 0x54}, {"ISO 2022 IR 166", "ISO-IR-166"}}, // Thai
    {{0x24, 0x42}, {"ISO 2022 IR 87", "ISO-IR-87"}} // Japanese (multi-byte)
};

//------------------------------------------------------------------------------

// Converts a DICOM text value to UTF-8.
//
// source:             raw bytes of the DICOM value.
// definedCharsetTerm: content of Specific Character Set (0008,0005); several values are
//                     separated by a backslash, which enables code extension techniques.
// logger:             optional logger receiving the warnings (may be null).
//
// Returns the converted string (an empty string when source is empty).
// Throws (through convertSequenceWithCodeExtensions) when an escape sequence is incomplete
// or does not designate a known character set.
std::string Encoding::convertString(
    const std::string& source,
    const std::string& definedCharsetTerm,
    const core::log::Logger::sptr& logger
)
{
    if(source.empty())
    {
        return "";
    }

    // Retrieve DICOM Specific Character Set List
    std::vector<std::string> definedTermList;
    boost::split(definedTermList, definedCharsetTerm, boost::is_any_of("\\"));

    // Only one charset without code extension techniques is used
    if(definedCharsetTerm.empty() || definedTermList.size() == 1)
    {
        return convertStringWithoutCodeExtensions(source, definedCharsetTerm, logger);
    }

    // Several charsets with code extension techniques are used

    // If the attribute Specific Character Set (0008,0005) has more than one value
    // and value 1 is empty, it is assumed that value 1 is ISO 2022 IR 6.
    if(definedTermList[0].empty())
    {
        definedTermList[0] = "ISO 2022 IR 6";
    }

    // Split on the ESC delimiter. The first token is the text located before the first ESC;
    // every following token starts with the bytes of an escape sequence.
    std::vector<std::string> sequenceList;
    boost::split(sequenceList, source, boost::is_any_of("\033"));

    std::string result;

    // The leading token carries no escape sequence: it is encoded with the first declared charset.
    // It is empty when the value starts with ESC; in that case there is nothing to convert, and it
    // must not be handed to convertSequenceWithCodeExtensions(), which would reject it as an
    // incomplete escape sequence.
    if(!sequenceList.front().empty())
    {
        result += convertStringWithoutCodeExtensions(sequenceList.front(), definedTermList[0], logger);
    }

    // Convert remaining sequences according to the charset selected by their escape sequence
    for(std::size_t index = 1 ; index < sequenceList.size() ; ++index)
    {
        result += convertSequenceWithCodeExtensions(sequenceList[index], definedTermList, logger);
    }

    return result;
}

//------------------------------------------------------------------------------

// Converts a text value encoded with a single character set (no code extension) to UTF-8.
//
// source:      raw bytes to convert.
// definedTerm: single DICOM defined term naming the character set; empty means ASCII.
// logger:      optional logger; when null, the warning goes to the application log instead.
//
// Returns source unchanged when the resolved charset name is empty (ASCII, or unknown term),
// otherwise the result of boost::locale::conv::to_utf.
std::string Encoding::convertStringWithoutCodeExtensions(
    const std::string& source,
    const std::string& definedTerm,
    const core::log::Logger::sptr& logger
)
{
    // No defined term: assuming ASCII (according to DICOM PS 3.5), nothing to convert
    if(definedTerm.empty())
    {
        return source;
    }

    SIGHT_WARN_IF(
        "'ISO_IR 6' is not a defined term in DICOM, will be treated as an empty value (ASCII)",
        definedTerm == "ISO_IR 6"
    );

    // Look the defined term up once and reuse the result
    const auto knownTerm = s_DEFINED_TERM_TO_CHARSET.find(definedTerm);

    if(knownTerm == s_DEFINED_TERM_TO_CHARSET.end())
    {
        // Unknown term: report it and fall back to ASCII (no conversion)
        const std::string msg = "'" + definedTerm + "' is not a defined term in DICOM, "
                                                    "will be treated as an empty value (ASCII)";

        SIGHT_WARN_IF(msg, !logger);
        if(logger)
        {
            logger->warning(msg);
        }

        return source;
    }

    const std::string targetCharset = knownTerm->second;

    // Empty value treated as ASCII
    if(targetCharset.empty())
    {
        return source;
    }

    return boost::locale::conv::to_utf<char>(source, targetCharset);
}

//------------------------------------------------------------------------------

// Emits a warning when definedTerm is not one of the entries of definedTermList, i.e. when an
// escape sequence selects a character set that was not declared in SpecificCharacterSet (0008,0005).
// The warning is sent to logger when it is set, and to the application log otherwise.
void checkDefinedTermDeclaration(
    const std::string& definedTerm,
    const std::vector<std::string>& definedTermList,
    const core::log::Logger::sptr& logger
)
{
    const auto listEnd     = definedTermList.end();
    const bool isDeclared = std::find(definedTermList.begin(), listEnd, definedTerm) != listEnd;

    // Nothing to report when the term was declared
    if(isDeclared)
    {
        return;
    }

    std::string msg = "Escape sequence refers to character set '";
    msg += definedTerm;
    msg += "' that was not declared in SpecificCharacterSet (0008,0005).";

    SIGHT_WARN_IF(msg, !logger);
    if(logger)
    {
        logger->warning(msg);
    }
}

//------------------------------------------------------------------------------

// Converts one ESC-delimited segment of a DICOM value to UTF-8.
//
// sequence:        segment content located right after an ESC character; it starts with the
//                  two or three bytes of the escape sequence, followed by the encoded text.
// definedTermList: values declared in SpecificCharacterSet (0008,0005), only used to warn when
//                  the escape sequence selects a character set that was not declared.
// logger:          optional logger receiving that warning (may be null).
//
// Returns the text following the escape sequence, unchanged when the selected charset name is
// empty (ASCII) and converted with boost::locale::conv::to_utf otherwise.
// Throws when the segment is shorter than two bytes or when no character set matches the
// escape sequence.
std::string Encoding::convertSequenceWithCodeExtensions(
    const std::string& sequence,
    const std::vector<std::string>& definedTermList,
    const core::log::Logger::sptr& logger
)
{
    // We need at least two more characters to determine the new character set
    SIGHT_THROW_IF("Cannot convert character set: Incomplete escape sequence.", sequence.size() < 2);

    const char first  = sequence[0];
    const char second = sequence[1];

    // Number of bytes taken by the escape sequence at the beginning of the segment
    std::size_t payloadOffset = 2;

    DefinedTermAndCharsetPairType termAndCharset = std::make_pair("", "");

    const EscapeSequenceType escapeKey = std::make_pair(first, second);
    const auto knownEscape             = s_ESCAPE_SEQUENCE_TO_CHARSET.find(escapeKey);

    if(knownEscape != s_ESCAPE_SEQUENCE_TO_CHARSET.end())
    {
        // Two-byte escape sequence found in the table
        termAndCharset = knownEscape->second;
    }
    else if((first == 0x24) && ((second == 0x28) || (second == 0x29)) && (sequence.size() >= 3))
    {
        // Three-byte escape sequence: the third byte selects the multi-byte character set
        payloadOffset = 3;
        const char third = sequence[2];

        if(second == 0x28)
        {
            if(third == 0x44) // Japanese (multi-byte)
            {
                termAndCharset = std::make_pair("ISO 2022 IR 159", "ISO-IR-159");
            }
        }
        else if(third == 0x43) // Korean (multi-byte)
        {
            // - is this mapping really correct?
            termAndCharset = std::make_pair("ISO 2022 IR 149", "EUC-KR");
        }
        else if(third == 0x41) // Simplified Chinese (multi-byte)
        {
            // - is this mapping really correct?
            termAndCharset = std::make_pair("ISO 2022 IR 58", "GB2312");
        }
    }

    // Check that a definedTerm has been found
    SIGHT_THROW_IF("Unable to retrieve character set from escape sequence.", termAndCharset.first.empty());

    // Check that the defined term has been declared in SpecificCharacterSet (0008,0005)
    checkDefinedTermDeclaration(termAndCharset.first, definedTermList, logger);

    // Text that follows the escape sequence
    const std::string payload = sequence.substr(payloadOffset);

    // Empty value treated as ASCII
    if(termAndCharset.second.empty())
    {
        return payload;
    }

    return boost::locale::conv::to_utf<char>(payload, termAndCharset.second);
}

//------------------------------------------------------------------------------

} //namespace helper

} //namespace sight::io::dicom