1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE NoImplicitPrelude #-}
-----------------------------------------------------------------------------
-- |
-- Module : GHC.IO.Encoding.Failure
-- Copyright : (c) The University of Glasgow, 2008-2011
-- License : see libraries/base/LICENSE
--
-- Maintainer : libraries@haskell.org
-- Stability : internal
-- Portability : non-portable
--
-- Types for specifying how text encoding/decoding fails
--
-----------------------------------------------------------------------------
module GHC.IO.Encoding.Failure (
CodingFailureMode(..), codingFailureModeSuffix,
isSurrogate,
recoverDecode, recoverEncode
) where
import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Exception
import GHC.Base
import GHC.Char
import GHC.Word
import GHC.Show
import GHC.Num
import GHC.Real ( fromIntegral )
--import System.Posix.Internals
-- | The 'CodingFailureMode' is used to construct 'TextEncoding's, and
-- specifies how they handle illegal sequences.
data CodingFailureMode
= ErrorOnCodingFailure
-- ^ Throw an error when an illegal sequence is encountered
| IgnoreCodingFailure
-- ^ Attempt to ignore and recover if an illegal sequence is
-- encountered
| TransliterateCodingFailure
-- ^ Replace with the closest visual match upon an illegal
-- sequence
| RoundtripFailure
-- ^ Use the private-use escape mechanism to attempt to allow
-- illegal sequences to be roundtripped.
deriving (Show)
-- This will only work properly for those encodings which are
-- strict supersets of ASCII in the sense that valid ASCII data
-- is also valid in that encoding. This is not true for
-- e.g. UTF-16, because ASCII characters must be padded to two
-- bytes to retain their meaning.
-- Note [Roundtripping]
-- ~~~~~~~~~~~~~~~~~~~~
--
-- Roundtripping is based on the ideas of PEP383.
--
-- We used to use the range of private-use characters from 0xEF80 to
-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery
-- to encode these characters.
--
-- However, people didn't like this because it means we don't get
-- guaranteed roundtripping for byte sequences that look like a UTF-8
-- encoded codepoint 0xEFxx.
--
-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
-- undecodable bytes, even though that may confuse Unicode processing
-- software written in Haskell. This guarantees roundtripping because
-- unicode input that includes lone surrogate codepoints is invalid by
-- definition.
--
--
-- When we used private-use characters there was a technical problem when it
-- came to encoding back to bytes using iconv. The iconv code will not fail when
-- it tries to encode a private-use character (as it would if trying to encode
-- a surrogate), which means that we wouldn't get a chance to replace it
-- with the byte we originally escaped.
--
-- To work around this, when filling the buffer to be encoded (in
-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
-- private-use characters with lone surrogates again! Likewise, when
-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had
-- to do the inverse process.
--
-- The user of String would never see these lone surrogates, but it
-- ensured that iconv will throw an error when encountering them. We
-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
codingFailureModeSuffix :: CodingFailureMode -> String
codingFailureModeSuffix ErrorOnCodingFailure = ""
codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE"
codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP"
-- | In transliterate mode, we use this character when decoding
-- unknown bytes.
--
-- This is the defined Unicode replacement character:
-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
unrepresentableChar :: Char
unrepresentableChar = '\xFFFD'
-- It is extraordinarily important that this series of
-- predicates/transformers gets inlined, because they tend to be used
-- in inner loops related to text encoding. In particular,
-- surrogatifyRoundtripCharacter must be inlined (see #5536)
-- | Some characters are actually "surrogate" codepoints defined for
-- use in UTF-16. We need to signal an invalid character if we detect
-- them when encoding a sequence of 'Char's into 'Word8's because they
-- won't give valid Unicode.
--
-- We may also need to signal an invalid character if we detect them
-- when encoding a sequence of 'Char's into 'Word8's because the
-- 'RoundtripFailure' mode creates these to round-trip bytes through
-- our internal UTF-16 encoding.
{-# INLINE isSurrogate #-}
isSurrogate :: Char -> Bool
isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
|| (0xDC00 <= x && x <= 0xDFFF)
where x = ord c
-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
{-# INLINE escapeToRoundtripCharacterSurrogate #-}
escapeToRoundtripCharacterSurrogate :: Word8 -> Char
escapeToRoundtripCharacterSurrogate b
| b < 128 = chr (fromIntegral b)
-- Disallow 'smuggling' of ASCII bytes. For roundtripping to
-- work, this assumes encoding is ASCII-superset.
| otherwise = chr (0xDC00 + fromIntegral b)
-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
{-# INLINE unescapeRoundtripCharacterSurrogate #-}
unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
unescapeRoundtripCharacterSurrogate c
| 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
| otherwise = Nothing
where x = ord c
recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
-> IO (Buffer Word8, Buffer Char)
recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
--puts $ "recoverDecode " ++ show ir
case cfm of
ErrorOnCodingFailure -> ioe_decodingError
IgnoreCodingFailure -> return (input { bufL=ir+1 }, output)
TransliterateCodingFailure -> do
ow' <- writeCharBuf oraw ow unrepresentableChar
return (input { bufL=ir+1 }, output { bufR=ow' })
RoundtripFailure -> do
b <- readWord8Buf iraw ir
ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
return (input { bufL=ir+1 }, output { bufR=ow' })
recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
-> IO (Buffer Char, Buffer Word8)
recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ }
output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do
(c,ir') <- readCharBuf iraw ir
--puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
case cfm of
IgnoreCodingFailure -> return (input { bufL=ir' }, output)
TransliterateCodingFailure -> do
if c == '?'
then return (input { bufL=ir' }, output)
else do
-- XXX: evil hack! To implement transliteration, we just
-- poke an ASCII ? into the input buffer and tell the caller
-- to try and decode again. This is *probably* safe given
-- current uses of TextEncoding.
--
-- The "if" test above ensures we skip if the encoding fails
-- to deal with the ?, though this should never happen in
-- practice as all encodings are in fact capable of
-- reperesenting all ASCII characters.
_ir' <- writeCharBuf iraw ir '?'
return (input, output)
-- This implementation does not work because e.g. UTF-16
-- requires 2 bytes to encode a simple ASCII value
--writeWord8Buf oraw ow unrepresentableByte
--return (input { bufL=ir' }, output { bufR=ow+1 })
RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
writeWord8Buf oraw ow x
return (input { bufL=ir' }, output { bufR=ow+1 })
_ -> ioe_encodingError
ioe_decodingError :: IO a
ioe_decodingError = ioException
(IOError Nothing InvalidArgument "recoverDecode"
"invalid byte sequence" Nothing Nothing)
ioe_encodingError :: IO a
ioe_encodingError = ioException
(IOError Nothing InvalidArgument "recoverEncode"
"invalid character" Nothing Nothing)
|