1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
|
{- GHC File system encoding handling.
-
- Copyright 2012-2021 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-tabs #-}
module Utility.FileSystemEncoding (
useFileSystemEncoding,
fileEncoding,
RawFilePath,
fromRawFilePath,
toRawFilePath,
decodeBL,
encodeBL,
decodeBS,
encodeBS,
truncateFilePath,
) where
import qualified GHC.IO.Encoding as Encoding
import System.IO
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.UTF8 as S8
import qualified Data.ByteString.Lazy.UTF8 as L8
#else
import qualified GHC.Foreign as GHC
import System.IO.Unsafe
import Data.ByteString.Unsafe (unsafePackMallocCStringLen)
import Data.Char
import Data.List
#endif
-- | A literal file path
type RawFilePath = S.ByteString
{- Makes all subsequent Handles that are opened, as well as stdio Handles,
- use the filesystem encoding, instead of the encoding of the current
- locale.
-
- The filesystem encoding allows "arbitrary undecodable bytes to be
- round-tripped through it". This avoids encoded failures when data is not
- encoded matching the current locale.
-
- Note that code can still use hSetEncoding to change the encoding of a
- Handle. This only affects the default encoding.
-}
useFileSystemEncoding :: IO ()
useFileSystemEncoding = do
#ifndef mingw32_HOST_OS
e <- Encoding.getFileSystemEncoding
#else
{- The file system encoding does not work well on Windows,
- and Windows only has utf FilePaths anyway. -}
let e = Encoding.utf8
#endif
hSetEncoding stdin e
hSetEncoding stdout e
hSetEncoding stderr e
Encoding.setLocaleEncoding e
fileEncoding :: Handle -> IO ()
#ifndef mingw32_HOST_OS
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
#else
fileEncoding h = hSetEncoding h Encoding.utf8
#endif
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBL :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS
decodeBL = decodeBS . L.toStrict
#else
{- On Windows, we assume that the ByteString is utf-8, since Windows
- only uses unicode for filenames. -}
decodeBL = L8.toString
#endif
{- Encodes a FilePath into a ByteString, applying the filesystem encoding. -}
encodeBL :: FilePath -> L.ByteString
#ifndef mingw32_HOST_OS
encodeBL = L.fromStrict . encodeBS
#else
encodeBL = L8.fromString
#endif
decodeBS :: S.ByteString -> FilePath
#ifndef mingw32_HOST_OS
-- This does the same thing as System.FilePath.ByteString.decodeFilePath,
-- with an identical implementation.
{-# NOINLINE decodeBS #-}
decodeBS b = unsafePerformIO $ do
enc <- Encoding.getFileSystemEncoding
S.useAsCStringLen b (GHC.peekCStringLen enc)
#else
decodeBS = S8.toString
#endif
encodeBS :: FilePath -> S.ByteString
#ifndef mingw32_HOST_OS
-- This does the same thing as System.FilePath.ByteString.encodeFilePath,
-- with an identical implementation.
{-# NOINLINE encodeBS #-}
encodeBS f = unsafePerformIO $ do
enc <- Encoding.getFileSystemEncoding
GHC.newCStringLen enc f >>= unsafePackMallocCStringLen
#else
encodeBS = S8.fromString
#endif
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = decodeBS
toRawFilePath :: FilePath -> RawFilePath
toRawFilePath = encodeBS
{- Truncates a path to the given number of bytes (or less),
- as represented on disk.
-
- Avoids returning an invalid part of a unicode byte sequence, at the
- cost of efficiency when running on a large FilePath.
-
- Note that this may return ""! That can happen if it is asked to truncate
- to eg 1 byte, but the input path starts with a unicode byte sequence.
-}
truncateFilePath :: Int -> RawFilePath -> RawFilePath
#ifndef mingw32_HOST_OS
{- On unix, do not assume a unicode locale, but does assume ascii
- characters are a single byte. -}
truncateFilePath n b =
let blen = S.length b
in if blen <= n
then b
else go blen (reverse (fromRawFilePath b))
where
go blen f = case uncons f of
Just (c, f')
| isAscii c ->
let blen' = blen - 1
in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
| otherwise ->
let blen' = S.length (toRawFilePath f')
in if blen' <= n
then toRawFilePath (reverse f')
else go blen' f'
Nothing -> toRawFilePath (reverse f)
#else
{- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = toRawFilePath . reverse . go [] n
where
go coll cnt bs
| cnt <= 0 = coll
| otherwise = case S8.decode bs of
Just (c, x)
| c /= S8.replacement_char ->
let x' = fromIntegral x
in if cnt - x' < 0
then coll
else go (c:coll) (cnt - x') (S8.drop 1 bs)
| otherwise ->
go ('_':coll) (cnt - 1) (S8.drop 1 bs)
_ -> coll
#endif
|