1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
|
import Control.Monad
import System.IO
import Control.Exception
import Foreign.Marshal.Array
import Foreign.Ptr
import GHC.Foreign
import GHC.IO.Encoding (TextEncoding, mkTextEncoding)
import Data.Char
import Data.Word
decode :: TextEncoding -> [Word8] -> IO String
decode enc xs = withArrayLen xs (\sz p -> peekCStringLen enc (castPtr p, sz)) `catch` \e -> return (show (e :: IOException))
encode :: TextEncoding -> String -> IO [Word8]
encode enc cs = withCStringLen enc cs (\(p, sz) -> peekArray sz (castPtr p)) `catch` \e -> return (const [] (e :: IOException))
asc :: Char -> Word8
asc = fromIntegral . ord
families = [ ([asc 'H', asc 'i', 0xED, 0xB2, 0x80, asc '!'],
["UTF-8", "UTF-8//IGNORE", "UTF-8//TRANSLIT", "UTF-8//ROUNDTRIP"])
, ([asc 'H', 0, asc 'i', 0, 0xFF, 0xDF, 0xFF, 0xDF, asc '!', 0],
["UTF-16LE", "UTF-16LE//IGNORE", "UTF-16LE//TRANSLIT", "UTF-16LE//ROUNDTRIP"])
, ([0, asc 'H', 0, asc 'i', 0xDF, 0xFF, 0xDF, 0xFF, 0, asc '!'],
["UTF-16BE", "UTF-16BE//IGNORE", "UTF-16BE//TRANSLIT", "UTF-16BE//ROUNDTRIP"])
, ([asc 'H', 0, 0, 0, asc 'i', 0, 0, 0, 0xED, 0xB2, 0x80, 0, asc '!', 0, 0, 0],
["UTF-32LE", "UTF-32LE//IGNORE", "UTF-32LE//TRANSLIT", "UTF-32LE//ROUNDTRIP"])
, ([0, 0, 0, asc 'H', 0, 0, 0, asc 'i', 0, 0x80, 0xB2, 0xED, 0, 0, 0, asc '!'],
["UTF-32BE", "UTF-32BE//IGNORE", "UTF-32BE//TRANSLIT", "UTF-32BE//ROUNDTRIP"])
]
main = do
surrogate_enc <- mkTextEncoding "UTF-8//ROUNDTRIP"
-- Test that invalid input is correctly roundtripped as surrogates
-- This only works for the UTF-8 UTF since it is the only UTF which
-- is an ASCII superset.
putStrLn $ "== UTF-8: roundtripping"
let invalid_bytes = [asc 'H', asc 'i', 0xED, 0xB2, 0x80, asc '!']
surrogates <- decode surrogate_enc invalid_bytes
invalid_bytes' <- encode surrogate_enc surrogates
print invalid_bytes
print surrogates
print invalid_bytes'
print (invalid_bytes == invalid_bytes')
putStrLn ""
forM families $ \(invalid_bytes, enc_names) -> do
encs <- mapM mkTextEncoding enc_names
let name = head enc_names
-- How we deal with decoding errors in the various modes:
putStrLn $ "== " ++ name ++ ": decoding"
forM encs $ \enc -> decode enc invalid_bytes >>= print
-- How about encoding errors, particularly those from embedded surrogates?
putStrLn $ "== " ++ name ++ ": encoding"
forM encs $ \enc -> encode enc "Hi\xDC80!" >>= print
putStrLn ""
|