From: John MacFarlane <jgm@berkeley.edu>
Date: Tue, 20 Jun 2023 13:50:13 -0700
Subject: Fix a security vulnerability in MediaBag and
 T.P.Class.IO.writeMedia.

This vulnerability, discovered by Entroy C, allows users to write
arbitrary files to any location by feeding pandoc a specially crafted
URL in an image element.  The vulnerability is serious for anyone
using pandoc to process untrusted input.

Origin: https://github.com/jgm/pandoc/commit/5e381e3878b5da87ee7542f7e51c3c1a7fd84b89
Origin: https://github.com/jgm/pandoc/commit/54561e9a6667b36a8452b01d2def9e3642013dd6
Origin: https://github.com/jgm/pandoc/commit/df4f13b262f7be5863042f8a5a1c365282c81f07
Origin: https://github.com/jgm/pandoc/commit/fe62da61dfd33e6b4c0c03895c528a47a0405bf7
Origin: https://github.com/jgm/pandoc/commit/5246f02f0bb9c176a6d2f6e3d0c03407d8a67445
Bug: https://github.com/jgm/pandoc/security/advisories/GHSA-xj5q-fv23-575g
Bug-Debian: https://security-tracker.debian.org/tracker/CVE-2023-35936
---
 pandoc.cabal                         |  1 +
 src/Text/Pandoc/Class/PandocIO.hs    | 12 ++++++------
 src/Text/Pandoc/Class/PandocMonad.hs |  2 +-
 test/Tests/MediaBag.hs               | 37 ++++++++++++++++++++++++++++++++++++
 test/test-pandoc.hs                  |  2 ++
 5 files changed, 47 insertions(+), 7 deletions(-)
 create mode 100644 test/Tests/MediaBag.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index a8b9f8e..04ab218 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -762,6 +762,7 @@ test-suite test-pandoc
                   Tests.Lua
                   Tests.Lua.Module
                   Tests.Shared
+                  Tests.MediaBag
                   Tests.Readers.LaTeX
                   Tests.Readers.HTML
                   Tests.Readers.JATS
diff --git a/src/Text/Pandoc/Class/PandocIO.hs b/src/Text/Pandoc/Class/PandocIO.hs
index 1cbfd68..0472816 100644
--- a/src/Text/Pandoc/Class/PandocIO.hs
+++ b/src/Text/Pandoc/Class/PandocIO.hs
@@ -57,7 +57,7 @@ import Network.HTTP.Client.Internal (addProxy)
 import Network.HTTP.Client.TLS (tlsManagerSettings)
 import Network.HTTP.Types.Header ( hContentType )
 import Network.Socket (withSocketsDo)
-import Network.URI ( unEscapeString )
+import Network.URI (URI(..), parseURI, unEscapeString)
 import Prelude
 import System.Directory (createDirectoryIfMissing)
 import System.Environment (getEnv)
@@ -131,11 +131,11 @@ instance PandocMonad PandocIO where
   newUniqueHash = hashUnique <$> liftIO IO.newUnique
 
   openURL u
-   | Just u'' <- T.stripPrefix "data:" u = do
-       let mime     = T.takeWhile (/=',') u''
-       let contents = UTF8.fromString $
-                       unEscapeString $ T.unpack $ T.drop 1 $ T.dropWhile (/=',') u''
-       return (decodeLenient contents, Just mime)
+   | Just (URI{ uriScheme = "data:",
+                uriPath = upath }) <- parseURI (T.unpack u) = do
+       let (mime, rest) = break (== ',') $ unEscapeString upath
+       let contents = UTF8.fromString $ drop 1 rest
+       return (decodeLenient contents, Just (T.pack mime))
    | otherwise = do
        let toReqHeader (n, v) = (CI.mk (UTF8.fromText n), UTF8.fromText v)
        customHeaders <- map toReqHeader <$> getsCommonState stRequestHeaders
diff --git a/src/Text/Pandoc/Class/PandocMonad.hs b/src/Text/Pandoc/Class/PandocMonad.hs
index 8229668..eb6bedd 100644
--- a/src/Text/Pandoc/Class/PandocMonad.hs
+++ b/src/Text/Pandoc/Class/PandocMonad.hs
@@ -612,7 +612,7 @@ fetchMediaResource :: PandocMonad m
               => T.Text -> m (FilePath, Maybe MimeType, BL.ByteString)
 fetchMediaResource src = do
   (bs, mt) <- downloadOrRead src
-  let ext = fromMaybe (T.pack $ takeExtension $ T.unpack src)
+  let ext = fromMaybe (T.pack $ takeExtension $ unEscapeString $ T.unpack src)
                       (mt >>= extensionFromMimeType)
   let bs' = BL.fromChunks [bs]
   let basename = showDigest $ sha1 bs'
diff --git a/test/Tests/MediaBag.hs b/test/Tests/MediaBag.hs
new file mode 100644
index 0000000..8a57337
--- /dev/null
+++ b/test/Tests/MediaBag.hs
@@ -0,0 +1,37 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Tests.MediaBag (tests) where
+
+import Test.Tasty
+import Test.Tasty.HUnit
+-- import Tests.Helpers
+import Text.Pandoc.Class (extractMedia, fillMediaBag, runIOorExplode)
+import System.IO.Temp (withTempDirectory)
+import Text.Pandoc.Shared (inDirectory)
+import System.FilePath
+import Text.Pandoc.Builder as B
+import System.Directory (doesFileExist, copyFile)
+
+tests :: [TestTree]
+tests = [
+  testCase "test fillMediaBag & extractMedia" $
+      withTempDirectory "." "extractMediaTest" $ \tmpdir -> inDirectory tmpdir $ do
+        copyFile "../../test/bodybg.gif" "bodybg.gif"
+        let d = B.doc $
+                  B.para (B.image "../../test/lalune.jpg" "" mempty) <>
+                  B.para (B.image "bodybg.gif" "" mempty) <>
+                  B.para (B.image "data://image/png;base64,cHJpbnQgImhlbGxvIgo=;.lua+%2f%2e%2e%2f%2e%2e%2fa%2elua" "" mempty) <>
+                  B.para (B.image "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" "" mempty)
+        runIOorExplode $ do
+          fillMediaBag d
+          extractMedia "foo" d
+        exists1 <- doesFileExist ("foo" </> "278e30c6961bc3e263c638fb15e114d35290db05.gif")
+        assertBool "file in directory is not extracted with hashed name" exists1
+        exists2 <- doesFileExist ("foo" </> "f9d88c3dbe18f6a7f5670e994a947d51216cdf0e.jpg")
+        assertBool "file above directory is not extracted with hashed name" exists2
+        exists3 <- doesFileExist ("foo" </> "2a0eaa89f43fada3e6c577beea4f2f8f53ab6a1d.lua")
+        exists4 <- doesFileExist "a.lua"
+        assertBool "data uri with malicious payload gets written outside of destination dir"
+          (exists3 && not exists4)
+        exists5 <- doesFileExist ("foo" </> "d5fceb6532643d0d84ffe09c40c481ecdf59e15a.gif")
+        assertBool "data uri with gif is not properly decoded" exists5
+  ]
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index 9d64b61..b1804dd 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -44,6 +44,7 @@ import qualified Tests.Writers.Powerpoint
 import qualified Tests.Writers.RST
 import qualified Tests.Writers.TEI
 import Tests.Helpers (findPandoc)
+import qualified Tests.MediaBag
 import Text.Pandoc.Shared (inDirectory)
 
 tests :: FilePath -> TestTree
@@ -51,6 +52,7 @@ tests pandocPath = testGroup "pandoc tests"
         [ Tests.Command.tests pandocPath
         , testGroup "Old" (Tests.Old.tests pandocPath)
         , testGroup "Shared" Tests.Shared.tests
+        , testGroup "MediaBag" Tests.MediaBag.tests
         , testGroup "Writers"
           [ testGroup "Native" Tests.Writers.Native.tests
           , testGroup "ConTeXt" Tests.Writers.ConTeXt.tests
