1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364
|
{-# LANGUAGE FlexibleContexts #-}
module Data.Encoding
(module Data.Encoding.Exception
,module Data.Encoding.ByteSource
,module Data.Encoding.ByteSink
,Encoding(..)
,DynEncoding
,recode
,encodeString
,encodeStringExplicit
,decodeString
,decodeStringExplicit
,encodeLazyByteString
,encodeLazyByteStringExplicit
,decodeLazyByteString
,decodeLazyByteStringExplicit
,encodeStrictByteString
,encodeStrictByteStringExplicit
,decodeStrictByteString
,decodeStrictByteStringExplicit
,encodingFromString
,encodingFromStringExplicit
)
where
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.Sequence
import Data.Foldable(toList)
import Data.Char
import Control.Monad.State
import Control.Monad.Identity
import Control.Monad.Error.Class
import Data.Binary.Put
import Data.Binary.Get
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Encoding.ASCII
import Data.Encoding.UTF8
import Data.Encoding.UTF16
import Data.Encoding.UTF32
import Data.Encoding.ISO88591
import Data.Encoding.ISO88592
import Data.Encoding.ISO88593
import Data.Encoding.ISO88594
import Data.Encoding.ISO88595
import Data.Encoding.ISO88596
import Data.Encoding.ISO88597
import Data.Encoding.ISO88598
import Data.Encoding.ISO88599
import Data.Encoding.ISO885910
import Data.Encoding.ISO885911
import Data.Encoding.ISO885913
import Data.Encoding.ISO885914
import Data.Encoding.ISO885915
import Data.Encoding.ISO885916
import Data.Encoding.CP1250
import Data.Encoding.CP1251
import Data.Encoding.CP1252
import Data.Encoding.CP1253
import Data.Encoding.CP1254
import Data.Encoding.CP1255
import Data.Encoding.CP1256
import Data.Encoding.CP1257
import Data.Encoding.CP1258
import Data.Encoding.KOI8R
import Data.Encoding.KOI8U
import Data.Encoding.GB18030
import Data.Encoding.MacOSRoman
import Data.Encoding.JISX0201
import Data.Encoding.JISX0208
import Data.Encoding.ISO2022JP
import Data.Encoding.ShiftJIS
import Data.Encoding.CP437
import Data.Encoding.CP737
import Data.Encoding.CP775
import Data.Encoding.CP850
import Data.Encoding.CP852
import Data.Encoding.CP855
import Data.Encoding.CP857
import Data.Encoding.CP860
import Data.Encoding.CP861
import Data.Encoding.CP862
import Data.Encoding.CP863
import Data.Encoding.CP864
import Data.Encoding.CP865
import Data.Encoding.CP866
import Data.Encoding.CP869
import Data.Encoding.CP874
import Data.Encoding.CP932
import Data.Char
import Text.Regex
-- | Transcodes within a single monad that is both a 'ByteSource' and a
--   'ByteSink': each step decodes one character with the first encoding and
--   immediately encodes it with the second. The stepping is driven by
--   'untilM_' with 'sourceEmpty' as its condition.
recode :: (Encoding enc1,Encoding enc2,ByteSource m,ByteSink m) => enc1 -> enc2 -> m ()
recode from to = untilM_ sourceEmpty $ do
  ch <- decodeChar from
  encodeChar to ch
-- | Encodes a 'String' with the given encoding and returns the encoded
--   output as a 'String'. The encoder runs in a 'State' monad whose state
--   starts as the empty sequence; the final sequence is flattened to a list.
encodeString :: Encoding enc => enc -> String -> String
encodeString e str = toList (viewl encoded)
  where
    -- Final sink state after running the encoder over the whole input.
    encoded = execState (encode e str) empty
-- | Variant of 'encodeString' that runs the encoder in 'StateT' over
--   'Either', so a failure is returned as @Left@ 'EncodingException' and a
--   success as @Right@ with the encoded output as a 'String'.
encodeStringExplicit :: Encoding enc => enc -> String -> Either EncodingException String
encodeStringExplicit e str = do
  encoded <- execStateT (encode e str) empty
  return (toList (viewl encoded))
-- | Decodes a 'String' holding encoded data with the given encoding. The
--   decoder runs in a 'State' monad whose initial state is the input string.
decodeString :: Encoding enc => enc -> String -> String
decodeString e = evalState (decode e)
-- | Variant of 'decodeString' that runs the decoder in 'StateT' over
--   'Either', returning @Left@ 'DecodingException' on failure instead of
--   a bare result.
decodeStringExplicit :: Encoding enc => enc -> String -> Either DecodingException String
decodeStringExplicit e str = decode e `evalStateT` str
-- | Encodes a 'String' with the given encoding into a lazy 'LBS.ByteString'
--   by running the encoder through 'runPut'.
encodeLazyByteString :: Encoding enc => enc -> String -> LBS.ByteString
encodeLazyByteString e str = (runPut . encode e) str
-- | Variant of 'encodeLazyByteString' that reports failure as a value.
--   The encoder is run in the 'PutME' sink; a wrapped @Left@ is passed on
--   unchanged, while a wrapped @Right@ yields the 'runPut' output of the
--   accumulated put action.
encodeLazyByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException LBS.ByteString
encodeLazyByteStringExplicit e str =
  case encode e str of
    PutME (Left failure)       -> Left failure
    PutME (Right (builder,())) -> Right (runPut builder)
-- | Decodes a lazy 'LBS.ByteString' with the given encoding by running the
--   decoder through 'runGet'.
decodeLazyByteString :: Encoding enc => enc -> LBS.ByteString -> String
decodeLazyByteString e = runGet (decode e)
-- | Variant of 'decodeLazyByteString' that runs the decoder in 'StateT'
--   over 'Either' with the input bytes as the initial state, returning
--   @Left@ 'DecodingException' on failure.
decodeLazyByteStringExplicit :: Encoding enc => enc -> LBS.ByteString -> Either DecodingException String
decodeLazyByteStringExplicit e = evalStateT (decode e)
-- | Encodes a 'String' with the given encoding into a strict
--   'BS.ByteString'. The encoder is handed to 'createStrict' and the second
--   component of its result (the produced bytes) is returned.
encodeStrictByteString :: Encoding enc => enc -> String -> BS.ByteString
encodeStrictByteString e = snd . createStrict . encode e
-- | Variant of 'encodeStrictByteString' that reports failure as a value.
--   The encoder is run in the 'StrictSinkE' sink; 'createStrict' returns the
--   encoder's 'Either' outcome paired with the produced bytes, and the bytes
--   are only returned when that outcome is a @Right@.
encodeStrictByteStringExplicit :: Encoding enc => enc -> String -> Either EncodingException BS.ByteString
encodeStrictByteStringExplicit e str =
  case createStrict sink of
    (Left failure, _) -> Left failure
    (Right _, bytes)  -> Right bytes
  where
    -- Unwrap the sink action so that 'createStrict' can run it.
    StrictSinkE sink = encode e str
-- | Decodes a strict 'BS.ByteString' with the given encoding. The decoder
--   runs in a 'State' monad whose initial state is the input bytes.
decodeStrictByteString :: Encoding enc => enc -> BS.ByteString -> String
decodeStrictByteString e str = decode e `evalState` str
-- | Variant of 'decodeStrictByteString' that runs the decoder in 'StateT'
--   over 'Either', returning @Left@ 'DecodingException' on failure.
decodeStrictByteStringExplicit :: Encoding enc => enc -> BS.ByteString -> Either DecodingException String
decodeStrictByteStringExplicit e = evalStateT (decode e)
-- | Like 'encodingFromString' but returns 'Nothing' instead of throwing an error.
--
--   The name is normalized before lookup: every run of characters other than
--   ASCII letters and digits is replaced by a single underscore and the result
--   is lower-cased. So @\"UTF-8\"@, @\"utf_8\"@ and @\"Utf 8\"@ all select the
--   same encoding.
--
--   Because of that normalization, every alias below must itself be written
--   in normalized form (lower case, only letters, digits and underscores).
--   An alias containing any other character, such as a dot, can never match.
encodingFromStringExplicit :: String -> Maybe DynEncoding
encodingFromStringExplicit codeName = case normalizeEncoding codeName of
    -- ASCII
    "ascii"              -> Just $ DynEncoding ASCII
    "646"                -> Just $ DynEncoding ASCII
    "ansi_x3_4_1968"     -> Just $ DynEncoding ASCII
    -- normalized form of "ANSI_X3.4-1986"
    "ansi_x3_4_1986"     -> Just $ DynEncoding ASCII
    "cp367"              -> Just $ DynEncoding ASCII
    "csascii"            -> Just $ DynEncoding ASCII
    "ibm367"             -> Just $ DynEncoding ASCII
    "iso646_us"          -> Just $ DynEncoding ASCII
    -- normalized form of "ISO_646.irv:1991"
    "iso_646_irv_1991"   -> Just $ DynEncoding ASCII
    "iso_ir_6"           -> Just $ DynEncoding ASCII
    "us"                 -> Just $ DynEncoding ASCII
    "us_ascii"           -> Just $ DynEncoding ASCII
    -- UTF-8
    "utf_8"              -> Just $ DynEncoding UTF8
    "u8"                 -> Just $ DynEncoding UTF8
    "utf"                -> Just $ DynEncoding UTF8
    "utf8"               -> Just $ DynEncoding UTF8
    "utf8_ucs2"          -> Just $ DynEncoding UTF8
    "utf8_ucs4"          -> Just $ DynEncoding UTF8
    -- UTF-16
    "utf_16"             -> Just $ DynEncoding UTF16
    "u16"                -> Just $ DynEncoding UTF16
    "utf16"              -> Just $ DynEncoding UTF16
    -- UTF-32
    "utf_32"             -> Just $ DynEncoding UTF32
    "u32"                -> Just $ DynEncoding UTF32
    "utf32"              -> Just $ DynEncoding UTF32
    -- KOI8-R
    "koi8_r"             -> Just $ DynEncoding KOI8R
    "cskoi8r"            -> Just $ DynEncoding KOI8R
    -- KOI8-U
    "koi8_u"             -> Just $ DynEncoding KOI8U
    -- ISO-8859-1
    "iso_8859_1"         -> Just $ DynEncoding ISO88591
    "iso8859_1"          -> Just $ DynEncoding ISO88591
    "8859"               -> Just $ DynEncoding ISO88591
    "cp819"              -> Just $ DynEncoding ISO88591
    "csisolatin1"        -> Just $ DynEncoding ISO88591
    "ibm819"             -> Just $ DynEncoding ISO88591
    "iso8859"            -> Just $ DynEncoding ISO88591
    "iso_8859_1_1987"    -> Just $ DynEncoding ISO88591
    "iso_ir_100"         -> Just $ DynEncoding ISO88591
    "l1"                 -> Just $ DynEncoding ISO88591
    "latin"              -> Just $ DynEncoding ISO88591
    "latin1"             -> Just $ DynEncoding ISO88591
    -- ISO-8859-2
    "iso_8859_2"         -> Just $ DynEncoding ISO88592
    "iso8859_2"          -> Just $ DynEncoding ISO88592
    "csisolatin2"        -> Just $ DynEncoding ISO88592
    "iso_8859_2_1987"    -> Just $ DynEncoding ISO88592
    "iso_ir_101"         -> Just $ DynEncoding ISO88592
    "l2"                 -> Just $ DynEncoding ISO88592
    "latin2"             -> Just $ DynEncoding ISO88592
    -- ISO-8859-3
    "iso_8859_3"         -> Just $ DynEncoding ISO88593
    "iso8859_3"          -> Just $ DynEncoding ISO88593
    "csisolatin3"        -> Just $ DynEncoding ISO88593
    "iso_8859_3_1988"    -> Just $ DynEncoding ISO88593
    "iso_ir_109"         -> Just $ DynEncoding ISO88593
    "l3"                 -> Just $ DynEncoding ISO88593
    "latin3"             -> Just $ DynEncoding ISO88593
    -- ISO-8859-4
    "iso_8859_4"         -> Just $ DynEncoding ISO88594
    "iso8859_4"          -> Just $ DynEncoding ISO88594
    "csisolatin4"        -> Just $ DynEncoding ISO88594
    "iso_8859_4_1988"    -> Just $ DynEncoding ISO88594
    "iso_ir_110"         -> Just $ DynEncoding ISO88594
    "l4"                 -> Just $ DynEncoding ISO88594
    "latin4"             -> Just $ DynEncoding ISO88594
    -- ISO-8859-5
    "iso_8859_5"         -> Just $ DynEncoding ISO88595
    "iso8859_5"          -> Just $ DynEncoding ISO88595
    "csisolatincyrillic" -> Just $ DynEncoding ISO88595
    "cyrillic"           -> Just $ DynEncoding ISO88595
    "iso_8859_5_1988"    -> Just $ DynEncoding ISO88595
    "iso_ir_144"         -> Just $ DynEncoding ISO88595
    -- ISO-8859-6
    "iso_8859_6"         -> Just $ DynEncoding ISO88596
    "iso8859_6"          -> Just $ DynEncoding ISO88596
    "arabic"             -> Just $ DynEncoding ISO88596
    "asmo_708"           -> Just $ DynEncoding ISO88596
    "csisolatinarabic"   -> Just $ DynEncoding ISO88596
    "ecma_114"           -> Just $ DynEncoding ISO88596
    "iso_8859_6_1987"    -> Just $ DynEncoding ISO88596
    "iso_ir_127"         -> Just $ DynEncoding ISO88596
    -- ISO-8859-7
    "iso_8859_7"         -> Just $ DynEncoding ISO88597
    "iso8859_7"          -> Just $ DynEncoding ISO88597
    "csisolatingreek"    -> Just $ DynEncoding ISO88597
    "ecma_118"           -> Just $ DynEncoding ISO88597
    "elot_928"           -> Just $ DynEncoding ISO88597
    "greek"              -> Just $ DynEncoding ISO88597
    "greek8"             -> Just $ DynEncoding ISO88597
    "iso_8859_7_1987"    -> Just $ DynEncoding ISO88597
    "iso_ir_126"         -> Just $ DynEncoding ISO88597
    -- ISO-8859-8
    "iso_8859_8"         -> Just $ DynEncoding ISO88598
    "iso8859_8"          -> Just $ DynEncoding ISO88598
    "csisolatinhebrew"   -> Just $ DynEncoding ISO88598
    "hebrew"             -> Just $ DynEncoding ISO88598
    "iso_8859_8_1988"    -> Just $ DynEncoding ISO88598
    "iso_ir_138"         -> Just $ DynEncoding ISO88598
    -- ISO-8859-9
    "iso_8859_9"         -> Just $ DynEncoding ISO88599
    "iso8859_9"          -> Just $ DynEncoding ISO88599
    "csisolatin5"        -> Just $ DynEncoding ISO88599
    "iso_8859_9_1989"    -> Just $ DynEncoding ISO88599
    "iso_ir_148"         -> Just $ DynEncoding ISO88599
    "l5"                 -> Just $ DynEncoding ISO88599
    "latin5"             -> Just $ DynEncoding ISO88599
    -- ISO-8859-10
    "iso_8859_10"        -> Just $ DynEncoding ISO885910
    "iso8859_10"         -> Just $ DynEncoding ISO885910
    "csisolatin6"        -> Just $ DynEncoding ISO885910
    "iso_8859_10_1992"   -> Just $ DynEncoding ISO885910
    "iso_ir_157"         -> Just $ DynEncoding ISO885910
    "l6"                 -> Just $ DynEncoding ISO885910
    "latin6"             -> Just $ DynEncoding ISO885910
    -- ISO-8859-11
    "iso_8859_11"        -> Just $ DynEncoding ISO885911
    "iso8859_11"         -> Just $ DynEncoding ISO885911
    "thai"               -> Just $ DynEncoding ISO885911
    "iso_8859_11_2001"   -> Just $ DynEncoding ISO885911
    -- ISO-8859-13
    "iso_8859_13"        -> Just $ DynEncoding ISO885913
    "iso8859_13"         -> Just $ DynEncoding ISO885913
    -- ISO-8859-14
    "iso_8859_14"        -> Just $ DynEncoding ISO885914
    "iso8859_14"         -> Just $ DynEncoding ISO885914
    "iso_8859_14_1998"   -> Just $ DynEncoding ISO885914
    "iso_celtic"         -> Just $ DynEncoding ISO885914
    "iso_ir_199"         -> Just $ DynEncoding ISO885914
    "l8"                 -> Just $ DynEncoding ISO885914
    "latin8"             -> Just $ DynEncoding ISO885914
    -- ISO-8859-15
    "iso_8859_15"        -> Just $ DynEncoding ISO885915
    "iso8859_15"         -> Just $ DynEncoding ISO885915
    "latin9"             -> Just $ DynEncoding ISO885915
    "l9"                 -> Just $ DynEncoding ISO885915
    -- ISO-8859-16
    "iso_8859_16"        -> Just $ DynEncoding ISO885916
    "iso8859_16"         -> Just $ DynEncoding ISO885916
    "iso_8859_16_2001"   -> Just $ DynEncoding ISO885916
    "iso_ir_226"         -> Just $ DynEncoding ISO885916
    "l10"                -> Just $ DynEncoding ISO885916
    "latin10"            -> Just $ DynEncoding ISO885916
    -- CP1250
    "cp1250"             -> Just $ DynEncoding CP1250
    "windows_1250"       -> Just $ DynEncoding CP1250
    -- CP1251
    "cp1251"             -> Just $ DynEncoding CP1251
    "windows_1251"       -> Just $ DynEncoding CP1251
    -- CP1252
    "cp1252"             -> Just $ DynEncoding CP1252
    "windows_1252"       -> Just $ DynEncoding CP1252
    -- CP1253
    "cp1253"             -> Just $ DynEncoding CP1253
    "windows_1253"       -> Just $ DynEncoding CP1253
    -- CP1254
    "cp1254"             -> Just $ DynEncoding CP1254
    "windows_1254"       -> Just $ DynEncoding CP1254
    -- CP1255
    "cp1255"             -> Just $ DynEncoding CP1255
    "windows_1255"       -> Just $ DynEncoding CP1255
    -- CP1256
    "cp1256"             -> Just $ DynEncoding CP1256
    "windows_1256"       -> Just $ DynEncoding CP1256
    -- CP1257
    "cp1257"             -> Just $ DynEncoding CP1257
    "windows_1257"       -> Just $ DynEncoding CP1257
    -- CP1258
    "cp1258"             -> Just $ DynEncoding CP1258
    "windows_1258"       -> Just $ DynEncoding CP1258
    -- GB18030
    "gb18030"            -> Just $ DynEncoding GB18030
    "gb18030_2000"       -> Just $ DynEncoding GB18030
    -- MacOSRoman
    "macintosh"          -> Just $ DynEncoding MacOSRoman
    -- JIS X 0201
    "jis_x_0201"         -> Just $ DynEncoding JISX0201
    -- JIS X 0208
    "jis_x_0208"         -> Just $ DynEncoding JISX0208
    -- ISO 2022-JP
    "iso_2022_jp"        -> Just $ DynEncoding ISO2022JP
    -- Shift JIS
    "shift_jis"          -> Just $ DynEncoding ShiftJIS
    "sjis"               -> Just $ DynEncoding ShiftJIS
    -- MSDOS codepages
    "cp437"              -> Just $ DynEncoding CP437
    "cp737"              -> Just $ DynEncoding CP737
    "cp775"              -> Just $ DynEncoding CP775
    "cp850"              -> Just $ DynEncoding CP850
    "cp852"              -> Just $ DynEncoding CP852
    "cp855"              -> Just $ DynEncoding CP855
    "cp857"              -> Just $ DynEncoding CP857
    "cp860"              -> Just $ DynEncoding CP860
    "cp861"              -> Just $ DynEncoding CP861
    "cp862"              -> Just $ DynEncoding CP862
    "cp863"              -> Just $ DynEncoding CP863
    "cp864"              -> Just $ DynEncoding CP864
    "cp865"              -> Just $ DynEncoding CP865
    "cp866"              -> Just $ DynEncoding CP866
    "cp869"              -> Just $ DynEncoding CP869
    "cp874"              -> Just $ DynEncoding CP874
    "cp932"              -> Just $ DynEncoding CP932
    -- defaults to nothing
    _                    -> Nothing
  where
    -- Collapse every separator run to "_" and lower-case the result.
    normalizeEncoding s = map toLower $ subRegex sep s "_"
    -- One or more characters that are not ASCII letters or digits.
    sep = mkRegex "[^0-9A-Za-z]+"
-- | Looks up a dynamic encoding by name via 'encodingFromStringExplicit'.
--   Calls 'error' with a message quoting the requested name when that lookup
--   yields 'Nothing'.
encodingFromString :: String -> DynEncoding
encodingFromString str =
  case encodingFromStringExplicit str of
    Just enc -> enc
    Nothing  -> error ("Data.Encoding.encodingFromString: Unknown encoding: " ++ show str)
|