From: Zdenek Hutyra <zhutyra@centrum.cz>
Date: Thu, 21 Nov 2024 10:04:17 +0000
Subject: Prevent Unicode decoding overrun
Origin: https://git.ghostscript.com/?p=ghostpdl.git;a=commit;h=d6e713dda4f8d75c6a4ed8c7568a0d4f532dcb17
Bug: https://bugs.ghostscript.com/show_bug.cgi?id=708132
Bug-Debian-Security: https://security-tracker.debian.org/tracker/CVE-2025-27831

Bug #708132 "Text buffer overflow with long characters"

The txt_get_unicode function was copying too few bytes from the
fixed glyph-name-to-Unicode mapping tables. This probably produced
incorrect Unicode code points in relatively rare cases, but was not
otherwise a problem.

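For illustration, a minimal, self-contained sketch (not the actual
table definitions) of why a raw byte count of 2/3/4 truncates entries
that are stored as unsigned short code units:

    #include <string.h>

    int main(void)
    {
        /* Hypothetical mapping entry: two code units, e.g. 'f' 'i'. */
        unsigned short Unicode[2] = { 0x0066, 0x0069 };
        unsigned short Buffer[4] = { 0 };

        /* Old code: copies 2 bytes, i.e. only the first code unit. */
        memcpy(Buffer, Unicode, 2);

        /* Fixed code: copies 2 code units (2 * sizeof(unsigned short) bytes). */
        memcpy(Buffer, Unicode, 2 * sizeof(unsigned short));
        return 0;
    }
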
However, a badly formed GlyphNames2Unicode array attached to a font
could cause the decoding to spill over the assigned buffer.

We really should rewrite the Unicode handling, but until we do,
checking that the length is no more than 4 Unicode code points is
enough to prevent an overrun. All the current clients allocate at
least 4 code points per character code.
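
A self-contained sketch of the new guard (the helper name and the
simplified copy are hypothetical; the real code in doc_common.c does
more work after this check):

    #include <string.h>

    /* Reject GlyphNames2Unicode entries that are odd-sized or would not
     * fit in the caller's buffer of 4 unsigned short code units. */
    static int copy_unicode(unsigned short *Buffer, unsigned short fallback,
                            const char *u, int length)
    {
        if (length & 1 || length > 4 * sizeof(unsigned short)) {
            *Buffer = fallback;
            return 1;
        }
        memcpy(Buffer, u, length);
        return length / (int)sizeof(unsigned short);
    }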

Added a comment to explain the magic number.

CVE-2025-27831
---
 devices/vector/doc_common.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/devices/vector/doc_common.c b/devices/vector/doc_common.c
index 690f8eaed7d8..05fb3d51f146 100644
--- a/devices/vector/doc_common.c
+++ b/devices/vector/doc_common.c
@@ -479,7 +479,7 @@ int txt_get_unicode(gx_device *dev, gs_font *font, gs_glyph glyph, gs_char ch, u
                     }
                     if (strlen(dentry->Glyph) == gnstr.size) {
                         if(memcmp(gnstr.data, dentry->Glyph, gnstr.size) == 0) {
-                            memcpy(Buffer, dentry->Unicode, 2);
+                            memcpy(Buffer, dentry->Unicode, 2 * sizeof(unsigned short));
                             return 2;
                         }
                     }
@@ -497,7 +497,7 @@ int txt_get_unicode(gx_device *dev, gs_font *font, gs_glyph glyph, gs_char ch, u
                     }
                     if (strlen(tentry->Glyph) == gnstr.size) {
                         if(memcmp(gnstr.data, tentry->Glyph, gnstr.size) == 0) {
-                            memcpy(Buffer, tentry->Unicode, 3);
+                            memcpy(Buffer, tentry->Unicode, 3 * sizeof(unsigned short));
                             return 3;
                         }
                     }
@@ -515,7 +515,7 @@ int txt_get_unicode(gx_device *dev, gs_font *font, gs_glyph glyph, gs_char ch, u
                     }
                     if (strlen(qentry->Glyph) == gnstr.size) {
                         if(memcmp(gnstr.data, qentry->Glyph, gnstr.size) == 0) {
-                            memcpy(Buffer, qentry->Unicode, 4);
+                            memcpy(Buffer, qentry->Unicode, 4 * sizeof(unsigned short));
                             return 4;
                         }
                     }
@@ -527,12 +527,16 @@ int txt_get_unicode(gx_device *dev, gs_font *font, gs_glyph glyph, gs_char ch, u
         return 1;
     } else {
         char *b, *u;
-        int l = length - 1;
+        int l;
 
         /* Real Unicode values should be at least 2 bytes. In fact I think the code assumes exactly
          * 2 bytes. If we got an odd number, give up and return the character code.
+         *
+         * The magic number here is due to the clients calling this code. Currently txtwrite and docxwrite
+         * allow up to 4 Unicode values per character/glyph, if the length would exceed that we can't
+         * write it. For now, again, fall back to the character code.
          */
-        if (length & 1) {
+        if (length & 1 || length > 4 * sizeof(unsigned short)) {
             *Buffer = fallback;
             return 1;
         }
-- 
2.49.0

