From: Ken Sharp <ken.sharp@artifex.com>
Date: Tue, 27 Sep 2022 13:03:57 +0100
Subject: PCL interpreter - fix decode_glyph for Unicode
Origin: https://git.ghostscript.com/?p=ghostpdl.git;a=commit;h=bf79b61cb1677d6865c45d397435848a21e8a647

The text extraction code (and the pdfwrite family) expects decode_glyph
to always return pairs of bytes (on the assumption that Unicode code
points are 2 bytes), and expects the return value from the routine to
be the number of bytes required to hold the value.

The PCL decode_glyph routine, however, was simply returning 1, which
caused the text extraction code some difficulty since it wasn't
expecting that.

This commit first alters the text extraction code to cope 'better'
with a decode_glyph routine which returns an odd length (basically,
ignore the result and fall back to using the character code).

We also alter the pl_decode_glyph routine to return 2 instead of 1,
so that it correctly tells the caller that it is returning 2 bytes.
Finally we make sure that the returned value is big-endian, because the
text extraction code assumes it will be.
---
 devices/vector/doc_common.c |  8 ++++++++
 pcl/pl/plfont.c             | 12 +++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/devices/vector/doc_common.c b/devices/vector/doc_common.c
index 4575aea4a8c0..71d5422d379b 100644
--- a/devices/vector/doc_common.c
+++ b/devices/vector/doc_common.c
@@ -529,6 +529,14 @@ int txt_get_unicode(gx_device *dev, gs_font *font, gs_glyph glyph, gs_char ch, u
         char *b, *u;
         int l = length - 1;
 
+        /* Real Unicode values should be at least 2 bytes. In fact I think the code assumes exactly
+         * 2 bytes. If we got an odd number, give up and return the character code.
+         */
+        if (length & 1) {
+            *Buffer = fallback;
+            return 1;
+        }
+
         unicode = (ushort *)gs_alloc_bytes(dev->memory, length, "temporary Unicode array");
         length = font->procs.decode_glyph((gs_font *)font, glyph, ch, unicode, length);
 #if ARCH_IS_BIG_ENDIAN
diff --git a/pcl/pl/plfont.c b/pcl/pl/plfont.c
index ec53dd9bff2f..6fe045e2cd4f 100644
--- a/pcl/pl/plfont.c
+++ b/pcl/pl/plfont.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2001-2021 Artifex Software, Inc.
+/* Copyright (C) 2001-2022 Artifex Software, Inc.
    All Rights Reserved.
 
    This software is provided AS-IS with no warranty, either express or
@@ -465,15 +465,21 @@ pl_glyph_name(gs_font * pfont, gs_glyph glyph, gs_const_string * pstr)
 static int
 pl_decode_glyph(gs_font * font, gs_glyph glyph, int ch, ushort *unicode_return, unsigned int length)
 {
+    unsigned char *ucode = (unsigned char *)unicode_return;
 
     if (ch < 0 || ch > 255)
         return (int) GS_NO_CHAR;
 
     if (length == 0)
-        return 1;
+        return 2;
 
+#if ARCH_IS_BIG_ENDIAN
     *unicode_return = (ushort)ch;
-    return 1;
+#else
+    ucode[0] = 0x00;
+    ucode[1] = ch & 0xff;
+#endif
+    return 2;
 }
 
 /* ---------------- Width cache ---------------- */
-- 
2.49.0

