File: unicode.cc

package info (click to toggle)
performous 1.3.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 63,316 kB
  • sloc: cpp: 35,856; sh: 927; python: 631; xml: 480; makefile: 38
file content (182 lines) | stat: -rw-r--r-- 6,141 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#include "unicode.hh"

#include "configuration.hh"
#include "game.hh"

#include <regex>
#include <stdexcept>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <unicode/ubidi.h>
#include "../ced/compact_enc_det/compact_enc_det.h"

// Out-of-class definitions of UnicodeUtil's static data members.
// Both collators start out as null pointers; nothing in this block creates them.
std::unique_ptr<icu::RuleBasedCollator> UnicodeUtil::m_searchCollator;
std::unique_ptr<icu::RuleBasedCollator> UnicodeUtil::m_sortCollator;

// Converters keyed by codepage name; entries are added by UnicodeUtil::getConverter().
std::map<std::string, Converter> UnicodeUtil::m_converters;

// Opens an ICU converter for the given codepage name.
// Throws std::runtime_error when ICU reports a failure while opening it.
Converter::Converter(std::string const& codepage): m_codepage(codepage), m_converter(nullptr, &ucnv_close) {
	// The handle starts out null and is filled in here, in the constructor body.
	m_converter.reset(ucnv_open(m_codepage.c_str(), m_error));
	if (!m_error.isFailure()) return;
	throw std::runtime_error("unicode/error: " + std::to_string(m_error.get()) + ": " + std::string(m_error.errorName()));
}

Converter::Converter(Converter&& c) noexcept:
	m_codepage(std::move(c.m_codepage)),
	m_converter(std::move(c.m_converter)),
	m_error(std::move(c.m_error)) {}

// Decodes `sv`, interpreted in this converter's codepage, into an icu::UnicodeString.
// Holds m_lock for the duration of the call, so calls on one Converter are serialised.
// Throws std::runtime_error if ICU reports a conversion failure.
icu::UnicodeString Converter::convertToUTF8(std::string_view sv) {
	std::scoped_lock l(m_lock);
	// Converters are cached and reused. Clear any failure left over from an earlier
	// call, otherwise ICU would refuse to do any work on every later call.
	m_error.reset();
	// Pass the explicit length: a string_view is not guaranteed to be NUL-terminated,
	// so asking ICU to find the terminator itself (length -1) could read past the end.
	icu::UnicodeString ret(sv.data(), static_cast<int32_t>(sv.size()), m_converter.get(), m_error);
	if (m_error.isFailure()) throw std::runtime_error("Couldn't convert string: " + std::string(sv) + " to UTF-8. Error: " + std::to_string(m_error.get()) + ": " + m_error.errorName());
	return ret;
}

// Returns the cached converter for codepage `s`, creating it on first use.
// The Converter is constructed in place from `s` only when the key is missing,
// so a cache hit no longer opens (and immediately closes) a throw-away ICU converter.
// Throws std::runtime_error (from the Converter constructor) if the codepage cannot be opened.
Converter& UnicodeUtil::getConverter(std::string const& s) {
	return m_converters.try_emplace(s, s).first->second;
}

std::string UnicodeUtil::getCharset (std::string_view& str) {
	int bytes_consumed;
	bool is_reliable;
	if (removeUTF8BOM(str)) return "UTF-8";

	Encoding encoding = CompactEncDet::DetectEncoding(
		str.data(), static_cast<int>(str.size()),
		nullptr, nullptr, nullptr,
		UNKNOWN_ENCODING,
		UNKNOWN_LANGUAGE,
		CompactEncDet::WEB_CORPUS,
		true,
		&bytes_consumed,
		&is_reliable);

	if (!is_reliable) {
			std::clog << "unicode/warning: detected encoding (" <<
			MimeEncodingName(encoding) << ") for text: " <<
			((str.size() <= 256) ? str : str.substr(0,255)) <<
			" was flagged as not reliable." <<
			std::endl; // Magic number, so sue me.
		}
	return MimeEncodingName(encoding);
}

// Converts `str` to UTF-8, optionally applying a case mapping.
//   str        - input bytes; the encoding is detected unless assumeUTF8 is set.
//   _filename  - when non-empty, used only in the log line emitted for non-UTF-8 input.
//   toCase     - case mapping applied to the decoded text before re-encoding.
//   assumeUTF8 - skip encoding detection and treat the input as UTF-8.
// Returns the UTF-8 result, or an empty string when decoding produced no text.
// Exceptions thrown by the converter lookup or the conversion propagate to the caller.
std::string UnicodeUtil::convertToUTF8 (std::string_view str, std::string _filename, CaseMapping toCase, bool assumeUTF8) {
	std::string const charset = assumeUTF8 ? std::string("UTF-8") : UnicodeUtil::getCharset(str);
	icu::UnicodeString ustring;
	if (charset != "UTF-8") {
		if (!_filename.empty()) std::clog << "unicode/info: " << _filename << " does not appear to be UTF-8; (" << charset << ") detected." << std::endl; 
		ustring = UnicodeUtil::getConverter(charset).convertToUTF8(str);
	}
	else {
		// Pass the explicit length: a string_view is not guaranteed to be NUL-terminated.
		ustring = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), static_cast<int32_t>(str.size())));
	}
	switch(toCase) {
		case CaseMapping::UPPER:
			ustring.toUpper();
			break;
		case CaseMapping::LOWER:
			ustring.toLower();
			break;
		case CaseMapping::TITLE:
			ustring.toTitle(nullptr, icu::Locale(TranslationEngine::getCurrentLanguageCode().c_str()), U_TITLECASE_NO_LOWERCASE);
			break;
		case CaseMapping::NONE:
			break;
	}
	std::string ret;
	if (!ustring.isEmpty()) {
		ustring.toUTF8String(ret);
	}
	else if (!str.empty()) {
		// Non-empty input decoded to nothing. (The previous check tested the freshly
		// constructed, always-empty result string, so this message could never appear.)
		std::clog << "unicode/error: tried to convert text in an unknown encoding: " << charset << std::endl;
	}
	return ret;
}

// Strips a leading UTF-8 byte order mark (the three bytes EF BB BF) from `str`.
// Returns true when a BOM was present and has been removed from the view,
// false when `str` was left untouched.
bool UnicodeUtil::removeUTF8BOM(std::string_view& str) {
	bool const hasBom = (str.compare(0, 3, "\xEF\xBB\xBF") == 0);
	if (hasBom) str.remove_prefix(3);
	return hasBom;
}

bool UnicodeUtil::caseEqual (std::string_view lhs, std::string_view rhs, bool assumeUTF8) {
	if (lhs == rhs) return true; // Early return
	std::string lhsCharset = UnicodeUtil::getCharset(lhs);
	std::string rhsCharset = UnicodeUtil::getCharset(rhs);;
	icu::UnicodeString lhsUniString;
	icu::UnicodeString rhsUniString;
	if (lhsCharset != "UTF-8" && !assumeUTF8) {
		lhsUniString = UnicodeUtil::getConverter(lhsCharset).convertToUTF8(lhs);
	}
	else lhsUniString = icu::UnicodeString::fromUTF8(lhs.data());
	if (rhsCharset != "UTF-8" && !assumeUTF8) {
		rhsUniString = UnicodeUtil::getConverter(rhsCharset).convertToUTF8(rhs);
	}
	else rhsUniString = icu::UnicodeString::fromUTF8(rhs.data());
	int8_t result = lhsUniString.caseCompare(rhsUniString, U_FOLD_CASE_DEFAULT);
	return (result == 0);
}

// Reports whether `str` is anything other than purely left-to-right text,
// i.e. whether ICU's bidi analysis yields a direction different from UBIDI_LTR.
// The encoding of `str` is detected first. If the bidi analysis fails, a warning
// is logged and false (LTR) is returned.
// Exceptions thrown by the converter lookup or the conversion propagate to the caller.
bool UnicodeUtil::isRTL(std::string_view str) {
	std::string const charset(UnicodeUtil::getCharset(str));
	icu::UnicodeString const ustring = UnicodeUtil::getConverter(charset).convertToUTF8(str);
	// unique_ptr never invokes its deleter on a null pointer, so ubidi_close can be used directly.
	std::unique_ptr<UBiDi, decltype(&ubidi_close)> _uBiDiObj(ubidi_open(), &ubidi_close);
	icu::ErrorCode _unicodeError;
	// Pass the explicit length: the buffer returned by getBuffer() is not guaranteed
	// to be NUL-terminated, so letting ICU search for a terminator (length -1) is unsafe.
	ubidi_setPara(
		_uBiDiObj.get(),
		ustring.getBuffer(),
		ustring.length(),
		UBIDI_DEFAULT_LTR,
		nullptr,
		_unicodeError
	);
	if (_unicodeError.isSuccess()) return ubidi_getDirection(_uBiDiObj.get()) != UBIDI_LTR;
	std::clog << "unicode/warning: Error (" << std::to_string(_unicodeError.get()) << ": " << _unicodeError.errorName() << "), determining text direction for: " << str << ", will assume LTR." << std::endl;
	return false;
}

// Returns `str` converted to UTF-8 with the LOWER case mapping applied.
// No file name is supplied for logging.
std::string UnicodeUtil::toLower (std::string_view str) {
	return UnicodeUtil::convertToUTF8(str, std::string{}, CaseMapping::LOWER);
}

// Returns `str` converted to UTF-8 with the UPPER case mapping applied.
// No file name is supplied for logging.
std::string UnicodeUtil::toUpper (std::string_view str) {
	return UnicodeUtil::convertToUTF8(str, std::string{}, CaseMapping::UPPER);
}

// Returns `str` converted to UTF-8 with the TITLE case mapping applied.
// No file name is supplied for logging.
std::string UnicodeUtil::toTitle (std::string_view str) {
	return UnicodeUtil::convertToUTF8(str, std::string{}, CaseMapping::TITLE);
}

// Rewrites every value in `stringmap` for sorting: the value is converted to UTF-8
// and, when it starts with one of the terms from the "game/sorting_ignore" config
// list followed by whitespace, that term is moved behind the rest ("<rest>,<term>").
// Matching is case-insensitive. Terms are inserted into the regular expression
// verbatim, exactly as before.
void UnicodeUtil::collate (songMetadata& stringmap) {
	ConfigItem::StringList const termsToCollate = config["game/sorting_ignore"].sl();
	if (termsToCollate.empty()) {
		// Nothing to move. Previously this built the invalid pattern "^((" and std::regex threw.
		for (auto& [key, value]: stringmap) value = convertToUTF8(value);
		return;
	}
	// Build the alternation once, joining by position. Comparing each term's value
	// with front()/back() misplaced separators and the suffix when terms repeated.
	std::string pattern("^((");
	for (auto term = termsToCollate.begin(); term != termsToCollate.end(); ++term) {
		if (term != termsToCollate.begin()) pattern += '|';
		pattern += *term;
	}
	pattern += ")\\s(.+))$";
	// Compile the expression a single time instead of once per map entry.
	std::regex const ignoredPrefix(pattern, std::regex_constants::icase);
	for (auto& [key, value]: stringmap) {
		// Group 2 is the ignored term, group 3 the remainder of the value.
		value = std::regex_replace(convertToUTF8(value), ignoredPrefix, "$3,$2");
	}
}