1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
|
#include "unicode.hh"
#include "configuration.hh"
#include "game.hh"
#include <regex>
#include <stdexcept>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <unicode/ubidi.h>
#include "../ced/compact_enc_det/compact_enc_det.h"
// Out-of-class definitions of UnicodeUtil's static data members.
// The two collator pointers start out empty; nothing in this part of the file assigns them.
std::unique_ptr<icu::RuleBasedCollator> UnicodeUtil::m_searchCollator;
std::unique_ptr<icu::RuleBasedCollator> UnicodeUtil::m_sortCollator;
// Converters keyed by the string passed to UnicodeUtil::getConverter(), which fills this map on demand.
std::map<std::string, Converter> UnicodeUtil::m_converters{};
// Opens an ICU converter for the given codepage name.
// Throws std::runtime_error carrying the ICU error code and name when ucnv_open() reports a failure.
Converter::Converter(std::string const& codepage): m_codepage(codepage), m_converter(nullptr, &ucnv_close) {
	// The handle is opened in the body (not in the initializer list) and handed to the
	// owning pointer, which closes it through ucnv_close.
	m_converter.reset(ucnv_open(m_codepage.c_str(), m_error));
	if (m_error.isSuccess()) return;
	std::string message("unicode/error: ");
	message += std::to_string(m_error.get());
	message += ": ";
	message += std::string(m_error.errorName());
	throw std::runtime_error(message);
}
// Move constructor: takes over the codepage name, the converter handle and the error state of c.
// m_lock (the lock taken in convertToUTF8) is not listed, so it is default-initialised rather than moved.
Converter::Converter(Converter&& c) noexcept:
m_codepage(std::move(c.m_codepage)),
m_converter(std::move(c.m_converter)),
m_error(std::move(c.m_error)) {}
// Decodes sv, interpreted in this converter's codepage, into an icu::UnicodeString.
// Note that, despite the name, the result is an ICU string object, not UTF-8 bytes;
// callers serialise it with toUTF8String() when they need UTF-8.
// Access to the shared converter handle is serialised through m_lock.
// Throws std::runtime_error when ICU reports a conversion failure.
icu::UnicodeString Converter::convertToUTF8(std::string_view sv) {
	std::scoped_lock l(m_lock);
	// m_error is sticky: ICU functions do nothing once it holds a failure, so a previous
	// failed conversion must not leak into this call.
	m_error.reset();
	// Pass the explicit length: a string_view is not guaranteed to be NUL-terminated,
	// so asking ICU to search for the terminator (-1) could read past the view.
	icu::UnicodeString ret(sv.data(), static_cast<int32_t>(sv.size()), m_converter.get(), m_error);
	if (m_error.isFailure()) throw std::runtime_error("Couldn't convert string: " + std::string(sv) + " to UTF-8. Error: " + std::to_string(m_error.get()) + ": " + m_error.errorName());
	return ret;
}
// Returns the cached converter for codepage s, creating and caching it on first use.
// Construction may throw std::runtime_error (see Converter's constructor); nothing is cached in that case.
Converter& UnicodeUtil::getConverter(std::string const& s) {
	// Forward the codepage name instead of a ready-made Converter: try_emplace only
	// constructs the mapped value when the key is missing, so a cache hit no longer
	// opens (and immediately closes) a throw-away ICU converter.
	return m_converters.try_emplace(s, s).first->second;
}
std::string UnicodeUtil::getCharset (std::string_view& str) {
int bytes_consumed;
bool is_reliable;
if (removeUTF8BOM(str)) return "UTF-8";
Encoding encoding = CompactEncDet::DetectEncoding(
str.data(), static_cast<int>(str.size()),
nullptr, nullptr, nullptr,
UNKNOWN_ENCODING,
UNKNOWN_LANGUAGE,
CompactEncDet::WEB_CORPUS,
true,
&bytes_consumed,
&is_reliable);
if (!is_reliable) {
std::clog << "unicode/warning: detected encoding (" <<
MimeEncodingName(encoding) << ") for text: " <<
((str.size() <= 256) ? str : str.substr(0,255)) <<
" was flagged as not reliable." <<
std::endl; // Magic number, so sue me.
}
return MimeEncodingName(encoding);
}
// Returns str re-encoded as UTF-8, optionally case-mapped.
//   str        - input bytes; unless assumeUTF8 is set the encoding is detected with getCharset()
//                (which also strips a leading UTF-8 BOM).
//   _filename  - only used for the informational log line when the input is not UTF-8; may be empty.
//   toCase     - case mapping applied to the decoded text (NONE leaves it unchanged).
//   assumeUTF8 - skip detection and decode the bytes as UTF-8.
// May throw std::runtime_error from the converter when a non-UTF-8 input cannot be decoded.
std::string UnicodeUtil::convertToUTF8 (std::string_view str, std::string _filename, CaseMapping toCase, bool assumeUTF8) {
	std::string const charset = assumeUTF8 ? std::string("UTF-8") : UnicodeUtil::getCharset(str);
	icu::UnicodeString ustring;
	if (charset == "UTF-8") {
		// Hand ICU the explicit length: a string_view need not be NUL-terminated, so letting
		// ICU measure the buffer itself could read past the end of the view.
		ustring = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), static_cast<int32_t>(str.size())));
	}
	else {
		if (!_filename.empty()) std::clog << "unicode/info: " << _filename << " does not appear to be UTF-8; (" << charset << ") detected." << std::endl;
		ustring = UnicodeUtil::getConverter(charset).convertToUTF8(str);
	}
	switch(toCase) {
		case CaseMapping::UPPER:
			ustring.toUpper();
			break;
		case CaseMapping::LOWER:
			ustring.toLower();
			break;
		case CaseMapping::TITLE:
			// No custom break iterator; title-casing follows the current UI language.
			ustring.toTitle(nullptr, icu::Locale(TranslationEngine::getCurrentLanguageCode().c_str()), U_TITLECASE_NO_LOWERCASE);
			break;
		case CaseMapping::NONE:
			break;
	}
	std::string ret;
	if (!ustring.isEmpty()) {
		ustring.toUTF8String(ret);
	}
	else if (!str.empty()) {
		// Non-empty input that decoded to nothing. (This used to test the freshly created,
		// always-empty result string, so the message could never be printed.)
		std::clog << "unicode/error: tried to convert text in an unknown encoding: " << charset << std::endl;
	}
	return ret;
}
// Strips a leading UTF-8 byte order mark from str.
// Returns true (and advances the view past the three BOM bytes) when one was present,
// false (leaving str untouched) otherwise.
bool UnicodeUtil::removeUTF8BOM(std::string_view& str) {
	static constexpr std::string_view utf8Bom{"\xEF\xBB\xBF"};
	// substr clamps to the available length, so inputs shorter than the BOM simply don't match.
	if (str.substr(0, utf8Bom.size()) != utf8Bom) return false;
	str.remove_prefix(utf8Bom.size());
	return true;
}
bool UnicodeUtil::caseEqual (std::string_view lhs, std::string_view rhs, bool assumeUTF8) {
if (lhs == rhs) return true; // Early return
std::string lhsCharset = UnicodeUtil::getCharset(lhs);
std::string rhsCharset = UnicodeUtil::getCharset(rhs);;
icu::UnicodeString lhsUniString;
icu::UnicodeString rhsUniString;
if (lhsCharset != "UTF-8" && !assumeUTF8) {
lhsUniString = UnicodeUtil::getConverter(lhsCharset).convertToUTF8(lhs);
}
else lhsUniString = icu::UnicodeString::fromUTF8(lhs.data());
if (rhsCharset != "UTF-8" && !assumeUTF8) {
rhsUniString = UnicodeUtil::getConverter(rhsCharset).convertToUTF8(rhs);
}
else rhsUniString = icu::UnicodeString::fromUTF8(rhs.data());
int8_t result = lhsUniString.caseCompare(rhsUniString, U_FOLD_CASE_DEFAULT);
return (result == 0);
}
bool UnicodeUtil::isRTL(std::string_view str) {
bool _return = false;
icu::ErrorCode _unicodeError;
std::string charset(UnicodeUtil::getCharset(str));
icu::UnicodeString ustring = UnicodeUtil::getConverter(charset).convertToUTF8(str);
std::unique_ptr<UBiDi,void(*)(UBiDi*)> _uBiDiObj(
ubidi_open(),
[](UBiDi* p) {
if (p != nullptr) {
ubidi_close(p);
}
});
ubidi_setPara(
_uBiDiObj.get(),
ustring.getBuffer(),
-1,
UBIDI_DEFAULT_LTR,
nullptr,
_unicodeError
);
if (_unicodeError.isSuccess()) _return = (ubidi_getDirection(_uBiDiObj.get()) != UBIDI_LTR);
else {
std::clog << "unicode/warning: Error (" << std::to_string(_unicodeError.get()) << ": " << _unicodeError.errorName() << "), determining text direction for: " << str << ", will assume LTR." << std::endl;
}
return _return;
}
// Returns str converted to UTF-8 and lower-cased (no filename is reported in log messages).
std::string UnicodeUtil::toLower (std::string_view str) {
	std::string const noFilename;
	return convertToUTF8(str, noFilename, CaseMapping::LOWER);
}
// Returns str converted to UTF-8 and upper-cased (no filename is reported in log messages).
std::string UnicodeUtil::toUpper (std::string_view str) {
	std::string const noFilename;
	return convertToUTF8(str, noFilename, CaseMapping::UPPER);
}
// Returns str converted to UTF-8 and title-cased (no filename is reported in log messages).
std::string UnicodeUtil::toTitle (std::string_view str) {
	std::string const noFilename;
	return convertToUTF8(str, noFilename, CaseMapping::TITLE);
}
// Rewrites every value of stringmap into its sorting form: the value is converted to UTF-8
// and, if it starts with one of the terms configured in "game/sorting_ignore" followed by
// whitespace, that term is moved behind the rest ("<term> <rest>" becomes "<rest>,<term>").
// Matching is case-insensitive. Terms are inserted into the regular expression verbatim.
void UnicodeUtil::collate (songMetadata& stringmap) {
	ConfigItem::StringList const termsToCollate = config["game/sorting_ignore"].sl();
	if (termsToCollate.empty()) {
		// Nothing to move around; an empty alternation would not even be a valid
		// expression. Still normalise the values to UTF-8.
		for (auto& [key, value]: stringmap) value = convertToUTF8(value);
		return;
	}
	// The pattern does not depend on the map entries, so build and compile it once.
	// Separators are placed by position, which stays correct when a term occurs twice.
	std::string pattern("^((");
	bool first = true;
	for (auto const& term : termsToCollate) {
		if (!first) pattern += '|';
		pattern += term;
		first = false;
	}
	pattern += ")\\s(.+))$";
	std::regex const leadingTerm(pattern, std::regex_constants::icase);
	for (auto& [key, value]: stringmap) {
		// Group 2 is the matched term, group 3 the remainder of the value.
		value = std::regex_replace(convertToUTF8(value), leadingTerm, "$3,$2");
	}
}
|