File: Fix-buffer-overflow-in-parse_stream-when-filehandle-.patch

package info (click to toggle)
libxml-parser-perl 2.47-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,396 kB
  • sloc: xml: 3,937; perl: 2,026; makefile: 38; ansic: 27
file content (108 lines) | stat: -rw-r--r-- 3,764 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
From: Toddr Bot <toddbot@rinaldo.us>
Date: Mon, 16 Mar 2026 20:55:31 +0000
Subject: Fix buffer overflow in parse_stream when filehandle has :utf8 layer
Origin: https://github.com/cpan-authors/XML-Parser/commit/5361c2b7f48599718cdecbe50c5fdd88b28ffd79

When a filehandle has a :utf8 PerlIO layer, Perl's read() counts its
length in decoded characters, but SvPV() returns the UTF-8 byte
representation, whose length can exceed the pre-allocated expat buffer.
Previously this caused heap corruption (double free / buffer overflow);
a later workaround (BUFSIZE * 6 + croak) prevented the corruption but
still failed with croak() instead of parsing the input.

Fix by setting the initial allocation to BUFSIZE bytes (instead of
BUFSIZE * 6) and re-obtaining the expat buffer with XML_GetBuffer() at
the actual byte size whenever a read produces more bytes than were
allocated. This handles UTF-8 streams gracefully without wasting memory
on an oversized buffer.

Fixes https://github.com/cpan-authors/XML-Parser/issues/64
(migrated from rt.cpan.org #19859)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Expat/Expat.xs  | 15 +++++++++++----
 t/utf8_stream.t | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 t/utf8_stream.t

diff --git a/Expat/Expat.xs b/Expat/Expat.xs
index 32fdce57ae4b..3cd1154886e7 100644
--- a/Expat/Expat.xs
+++ b/Expat/Expat.xs
@@ -343,8 +343,8 @@ parse_stream(XML_Parser parser, SV * ioref)
   }
   else {
     tbuff = newSV(0);
-    tsiz = newSViv(BUFSIZE); /* in UTF-8 characters */
-    buffsize = BUFSIZE * 6; /* in bytes that encode an UTF-8 string */
+    tsiz = newSViv(BUFSIZE);
+    buffsize = BUFSIZE;
   }
 
   while (! done)
@@ -387,8 +387,15 @@ parse_stream(XML_Parser parser, SV * ioref)
 
 	tb = SvPV(tbuff, br);
 	if (br > 0) {
-	  if (br > buffsize)
-	    croak("The input buffer is not large enough for read UTF-8 decoded string");
+	  if (br > buffsize) {
+	    /* The byte count from SvPV can exceed buffsize when the
+	       filehandle has a :utf8 layer, since Perl reads buffsize
+	       characters but multi-byte UTF-8 chars produce more bytes.
+	       Re-obtain the buffer at the required size. */
+	    buffer = XML_GetBuffer(parser, br);
+	    if (! buffer)
+	      croak("Ran out of memory for input buffer");
+	  }
 	  Copy(tb, buffer, br, char);
 	} else
 	  done = 1;
diff --git a/t/utf8_stream.t b/t/utf8_stream.t
new file mode 100644
index 000000000000..a7e55f78d78c
--- /dev/null
+++ b/t/utf8_stream.t
@@ -0,0 +1,40 @@
+BEGIN { print "1..2\n"; }
+END { print "not ok 1\n" unless $loaded; }
+use XML::Parser;
+$loaded = 1;
+print "ok 1\n";
+
+################################################################
+# Test parsing from a filehandle with :utf8 layer
+# Regression test for rt.cpan.org #19859 / GitHub issue #64
+# A UTF-8 stream caused buffer overflow because SvPV byte count
+# could exceed the pre-allocated XML_GetBuffer size.
+
+use File::Temp qw(tempfile);
+
+# Create a temp file with UTF-8 XML content containing multi-byte chars
+my ($fh, $tmpfile) = tempfile(UNLINK => 1);
+binmode($fh, ':raw');
+# Write raw UTF-8 bytes: XML with Chinese characters (3 bytes each in UTF-8)
+# U+4E16 U+754C (世界 = "world") repeated to create substantial multi-byte content
+my $body = "\xe4\xb8\x96\xe7\x95\x8c" x 20000;  # 120000 bytes / 40000 chars of 3-byte UTF-8
+print $fh qq(<?xml version="1.0" encoding="UTF-8"?>\n<doc>$body</doc>\n);
+close($fh);
+
+my $text = '';
+my $parser = XML::Parser->new(
+    Handlers => {
+        Char => sub { $text .= $_[1]; },
+    }
+);
+
+# Open with :utf8 layer - this is what triggers the bug
+open(my $in, '<:utf8', $tmpfile) or die "Cannot open $tmpfile: $!";
+eval { $parser->parse($in); };
+close($in);
+
+if ($@ eq '' && length($text) > 0) {
+    print "ok 2\n";
+} else {
+    print "not ok 2 # $@\n";
+}
-- 
2.53.0