1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
|
#include <qpdf/QPDFObject_private.hh>
#include <qpdf/QPDFObjectHandle_private.hh>
#include <qpdf/QUtil.hh>
#include <qpdf/Util.hh>
// DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
// including it in case it may accidentally be used.
// Return true when ch may be copied verbatim into a literal string: either printable ASCII
// (byte values 32 through 126) or a byte value of 160 and above. The test is made on the unsigned
// byte value so the answer does not depend on whether plain char is signed.
static bool
is_iso_latin1_printable(char ch)
{
    auto const byte = static_cast<unsigned char>(ch);
    if (byte >= 160) {
        return true;
    }
    return byte >= 32 && byte <= 126;
}
// Write this string to p as a double-quoted JSON string.
//
// When json_version is 1 the output is the string's text converted to UTF-8, with no prefix. For
// any other json_version the output starts with "u:" when the value is written as Unicode text,
// or with "b:" followed by the hex encoding of the raw bytes when it is not.
void
QPDF_String::writeJSON(int json_version, JSON::Writer& p)
{
    bool const version_one = (json_version == 1);

    if (util::is_utf16(val)) {
        // UTF-16 can always be represented unambiguously as Unicode.
        if (version_one) {
            p << "\"" << JSON::Writer::encode_string(QUtil::utf16_to_utf8(val)) << "\"";
        } else {
            p << "\"u:" << JSON::Writer::encode_string(QUtil::utf16_to_utf8(val)) << "\"";
        }
    } else if (util::is_explicit_utf8(val)) {
        // PDF 2.0 allows UTF-8 strings when explicitly prefixed with the three-byte
        // representation of U+FEFF; substr(3) drops that prefix.
        if (version_one) {
            p << "\"" << JSON::Writer::encode_string(val.substr(3)) << "\"";
        } else {
            p << "\"u:" << JSON::Writer::encode_string(val.substr(3)) << "\"";
        }
    } else if (version_one) {
        // Version 1 always treats anything else as PDF-doc encoded text.
        p << "\"" << JSON::Writer::encode_string(QUtil::pdf_doc_to_utf8(val)) << "\"";
    } else {
        if (!useHexString()) {
            auto const as_utf8 = QUtil::pdf_doc_to_utf8(val);
            std::string round_trip;
            bool const converted = QUtil::utf8_to_pdf_doc(as_utf8, round_trip, '?');
            if (converted && round_trip == val) {
                // The PDF-doc bytes survive a round trip through UTF-8, so the Unicode form is
                // lossless.
                p << "\"u:" << JSON::Writer::encode_string(as_utf8) << "\"";
                return;
            }
        }
        // Fall back to the raw bytes, hex encoded.
        p << "\"b:" << QUtil::hex_encode(val) << "\"";
    }
}
// Decide whether the string should be written in hexadecimal form.
//
// Heuristic: answer true as soon as a byte below 24 is seen that is not one of the escapable
// characters (newline, carriage return, tab, backspace, form feed), or when more than one fifth
// of the bytes fall outside the printable ASCII range 32..126.
bool
QPDF_String::useHexString() const
{
    unsigned int outside_ascii = 0;
    for (char const c: val) {
        auto const byte = static_cast<unsigned char>(c);
        if (byte >= 32 && byte <= 126) {
            // Printable ASCII never pushes toward hex.
            continue;
        }
        if (byte >= 24) {
            // Covers 24..31 as well as 127 and above.
            ++outside_ascii;
            continue;
        }
        switch (c) {
        case '\n':
        case '\r':
        case '\t':
        case '\b':
        case '\f':
            // These have escape sequences in a literal string, so they are tolerated.
            break;
        default:
            return true;
        }
    }
    return 5 * outside_ascii > val.length();
}
std::string
QPDF_String::unparse(bool force_binary)
{
bool use_hexstring = force_binary || useHexString();
std::string result;
if (use_hexstring) {
static auto constexpr hexchars = "0123456789abcdef";
result.reserve(2 * val.length() + 2);
result += '<';
for (const char c: val) {
result += hexchars[static_cast<unsigned char>(c) >> 4];
result += hexchars[c & 0x0f];
}
result += '>';
} else {
result += "(";
for (unsigned int i = 0; i < val.length(); ++i) {
char ch = val.at(i);
switch (ch) {
case '\n':
result += "\\n";
break;
case '\r':
result += "\\r";
break;
case '\t':
result += "\\t";
break;
case '\b':
result += "\\b";
break;
case '\f':
result += "\\f";
break;
case '(':
result += "\\(";
break;
case ')':
result += "\\)";
break;
case '\\':
result += "\\\\";
break;
default:
if (is_iso_latin1_printable(ch)) {
result += val.at(i);
} else {
result += "\\" +
QUtil::int_to_string_base(
static_cast<int>(static_cast<unsigned char>(ch)), 8, 3);
}
break;
}
}
result += ")";
}
return result;
}
|