From 61793762daa355dc7a2b5edd104ef38efb3efb2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Langa?= <lukasz@langa.pl>
Date: Wed, 4 Sep 2024 17:37:28 +0200
Subject: [3.11] gh-121650: Encode newlines in headers, and verify headers are
 sound (GH-122233) (#122608)

Per RFC 2047:

> [...] these encoding schemes allow the
> encoding of arbitrary octet values, mail readers that implement this
> decoding should also ensure that display of the decoded data on the
> recipient's terminal will not cause unwanted side-effects

It seems that the "quoted-word" scheme is a valid way to include
a newline character in a header value, just like we already allow
undecodable bytes or control characters.
They do need to be properly quoted when serialized to text, though.

Verify that email headers are well-formed.

This should fail for custom fold() implementations that aren't careful
about newlines.

(cherry picked from commit 097633981879b3c9de9a1dd120d3aa585ecc2384)

Co-authored-by: Petr Viktorin <encukou@gmail.com>
Co-authored-by: Bas Bloemsaat <bas@bloemsaat.org>
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
 Doc/library/email.errors.rst          |  5 +++
 Doc/library/email.policy.rst          | 18 ++++++++
 Lib/email/_header_value_parser.py     | 12 ++++--
 Lib/email/_policybase.py              |  8 ++++
 Lib/email/errors.py                   |  4 ++
 Lib/email/generator.py                | 13 +++++-
 Lib/test/test_email/test_generator.py | 62 +++++++++++++++++++++++++++
 Lib/test/test_email/test_policy.py    | 26 +++++++++++
 8 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/Doc/library/email.errors.rst b/Doc/library/email.errors.rst
index 194a98696f4..a98a2777310 100644
--- a/Doc/library/email.errors.rst
+++ b/Doc/library/email.errors.rst
@@ -58,6 +58,11 @@ The following exception classes are defined in the :mod:`email.errors` module:
    :class:`~email.mime.nonmultipart.MIMENonMultipart` (e.g.
    :class:`~email.mime.image.MIMEImage`).
 
+.. exception:: HeaderWriteError()
+
+   Raised when an error occurs when the :mod:`~email.generator` outputs
+   headers.
+
 
 Here is the list of the defects that the :class:`~email.parser.FeedParser`
 can find while parsing messages.  Note that the defects are added to the message
diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst
index bf53b9520fc..f5ed3d20b0e 100644
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -229,6 +229,24 @@ added matters.  To illustrate::
 
       .. versionadded:: 3.6
 
+
+   .. attribute:: verify_generated_headers
+
+      If ``True`` (the default), the generator will raise
+      :exc:`~email.errors.HeaderWriteError` instead of writing a header
+      that is improperly folded or delimited, such that it would
+      be parsed as multiple headers or joined with adjacent data.
+      Such headers can be generated by custom header classes or bugs
+      in the ``email`` module.
+
+      As it's a security feature, this defaults to ``True`` even in the
+      :class:`~email.policy.Compat32` policy.
+      For backwards compatible, but unsafe, behavior, it must be set to
+      ``False`` explicitly.
+
+      .. versionadded:: 3.11.2-6+deb12u5
+
+
    The following :class:`Policy` method is intended to be called by code using
    the email library to create policy instances with custom settings:
 
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index e637e6df066..e1b99d5b417 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -92,6 +92,8 @@
 ASPECIALS = TSPECIALS | set("*'%")
 ATTRIBUTE_ENDS = ASPECIALS | WSP
 EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
+NLSET = {'\n', '\r'}
+SPECIALSNL = SPECIALS | NLSET
 
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
@@ -2778,9 +2780,13 @@ def _refold_parse_tree(parse_tree, *, policy):
             wrap_as_ew_blocked -= 1
             continue
         tstr = str(part)
-        if part.token_type == 'ptext' and set(tstr) & SPECIALS:
-            # Encode if tstr contains special characters.
-            want_encoding = True
+        if not want_encoding:
+            if part.token_type == 'ptext':
+                # Encode if tstr contains special characters.
+                want_encoding = not SPECIALSNL.isdisjoint(tstr)
+            else:
+                # Encode if tstr contains newlines.
+                want_encoding = not NLSET.isdisjoint(tstr)
         try:
             tstr.encode(encoding)
             charset = encoding
diff --git a/Lib/email/_policybase.py b/Lib/email/_policybase.py
index c9cbadd2a80..d1f48211f90 100644
--- a/Lib/email/_policybase.py
+++ b/Lib/email/_policybase.py
@@ -157,6 +157,13 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
     message_factory     -- the class to use to create new message objects.
                            If the value is None, the default is Message.
 
+    verify_generated_headers
+                        -- if true, the generator verifies that each header
+                           they are properly folded, so that a parser won't
+                           treat it as multiple headers, start-of-body, or
+                           part of another header.
+                           This is a check against custom Header & fold()
+                           implementations.
     """
 
     raise_on_defect = False
@@ -165,6 +172,7 @@ class Policy(_PolicyBase, metaclass=abc.ABCMeta):
     max_line_length = 78
     mangle_from_ = False
     message_factory = None
+    verify_generated_headers = True
 
     def handle_defect(self, obj, defect):
         """Based on policy, either raise defect or call register_defect.
diff --git a/Lib/email/errors.py b/Lib/email/errors.py
index 3ad00565549..02aa5eced6a 100644
--- a/Lib/email/errors.py
+++ b/Lib/email/errors.py
@@ -29,6 +29,10 @@ class CharsetError(MessageError):
     """An illegal charset was given."""
 
 
+class HeaderWriteError(MessageError):
+    """Error while writing headers."""
+
+
 # These are parsing defects which the parser was able to work around.
 class MessageDefect(ValueError):
     """Base class for a message defect."""
diff --git a/Lib/email/generator.py b/Lib/email/generator.py
index c9b121624e0..89224ae41cb 100644
--- a/Lib/email/generator.py
+++ b/Lib/email/generator.py
@@ -14,12 +14,14 @@
 from copy import deepcopy
 from io import StringIO, BytesIO
 from email.utils import _has_surrogates
+from email.errors import HeaderWriteError
 
 UNDERSCORE = '_'
 NL = '\n'  # XXX: no longer used by the code below.
 
 NLCRE = re.compile(r'\r\n|\r|\n')
 fcre = re.compile(r'^From ', re.MULTILINE)
+NEWLINE_WITHOUT_FWSP = re.compile(r'\r\n[^ \t]|\r[^ \n\t]|\n[^ \t]')
 
 
 
@@ -223,7 +225,16 @@ def _dispatch(self, msg):
 
     def _write_headers(self, msg):
         for h, v in msg.raw_items():
-            self.write(self.policy.fold(h, v))
+            folded = self.policy.fold(h, v)
+            if self.policy.verify_generated_headers:
+                linesep = self.policy.linesep
+                if not folded.endswith(self.policy.linesep):
+                    raise HeaderWriteError(
+                        f'folded header does not end with {linesep!r}: {folded!r}')
+                if NEWLINE_WITHOUT_FWSP.search(folded.removesuffix(linesep)):
+                    raise HeaderWriteError(
+                        f'folded header contains newline: {folded!r}')
+            self.write(folded)
         # A blank line always separates headers from body
         self.write(self._NL)
 
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index 89e7edeb63a..d29400f0ed1 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -6,6 +6,7 @@
 from email.generator import Generator, BytesGenerator
 from email.headerregistry import Address
 from email import policy
+import email.errors
 from test.test_email import TestEmailBase, parameterize
 
 
@@ -216,6 +217,44 @@ def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
         g.flatten(msg)
         self.assertEqual(s.getvalue(), self.typ(expected))
 
+    def test_keep_encoded_newlines(self):
+        msg = self.msgmaker(self.typ(textwrap.dedent("""\
+            To: nobody
+            Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com
+
+            None
+            """)))
+        expected = textwrap.dedent("""\
+            To: nobody
+            Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com
+
+            None
+            """)
+        s = self.ioclass()
+        g = self.genclass(s, policy=self.policy.clone(max_line_length=80))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), self.typ(expected))
+
+    def test_keep_long_encoded_newlines(self):
+        msg = self.msgmaker(self.typ(textwrap.dedent("""\
+            To: nobody
+            Subject: Bad subject=?UTF-8?Q?=0A?=Bcc: injection@example.com
+
+            None
+            """)))
+        expected = textwrap.dedent("""\
+            To: nobody
+            Subject: Bad subject
+             =?utf-8?q?=0A?=Bcc:
+             injection@example.com
+
+            None
+            """)
+        s = self.ioclass()
+        g = self.genclass(s, policy=self.policy.clone(max_line_length=30))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), self.typ(expected))
+
 
 class TestGenerator(TestGeneratorBase, TestEmailBase):
 
@@ -224,6 +263,29 @@ class TestGenerator(TestGeneratorBase, TestEmailBase):
     ioclass = io.StringIO
     typ = str
 
+    def test_verify_generated_headers(self):
+        """gh-121650: by default the generator prevents header injection"""
+        class LiteralHeader(str):
+            name = 'Header'
+            def fold(self, **kwargs):
+                return self
+
+        for text in (
+            'Value\r\nBad Injection\r\n',
+            'NoNewLine'
+        ):
+            with self.subTest(text=text):
+                message = message_from_string(
+                    "Header: Value\r\n\r\nBody",
+                    policy=self.policy,
+                )
+
+                del message['Header']
+                message['Header'] = LiteralHeader(text)
+
+                with self.assertRaises(email.errors.HeaderWriteError):
+                    message.as_string()
+
 
 class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
 
diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py
index e87c2755494..ff1ddf7d7a8 100644
--- a/Lib/test/test_email/test_policy.py
+++ b/Lib/test/test_email/test_policy.py
@@ -26,6 +26,7 @@ class PolicyAPITests(unittest.TestCase):
         'raise_on_defect':          False,
         'mangle_from_':             True,
         'message_factory':          None,
+        'verify_generated_headers': True,
         }
     # These default values are the ones set on email.policy.default.
     # If any of these defaults change, the docs must be updated.
@@ -277,6 +278,31 @@ def test_short_maxlen_error(self):
                 with self.assertRaises(email.errors.HeaderParseError):
                     policy.fold("Subject", subject)
 
+    def test_verify_generated_headers(self):
+        """Turning protection off allows header injection"""
+        policy = email.policy.default.clone(verify_generated_headers=False)
+        for text in (
+            'Header: Value\r\nBad: Injection\r\n',
+            'Header: NoNewLine'
+        ):
+            with self.subTest(text=text):
+                message = email.message_from_string(
+                    "Header: Value\r\n\r\nBody",
+                    policy=policy,
+                )
+                class LiteralHeader(str):
+                    name = 'Header'
+                    def fold(self, **kwargs):
+                        return self
+
+                del message['Header']
+                message['Header'] = LiteralHeader(text)
+
+                self.assertEqual(
+                    message.as_string(),
+                    f"{text}\nBody",
+                )
+
     # XXX: Need subclassing tests.
     # For adding subclassed objects, make sure the usual rules apply (subclass
     # wins), but that the order still works (right overrides left).
-- 
2.30.2