File: IConv.hs

package info (click to toggle)
haskell-iconv 0.4.1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: wheezy
  • size: 108 kB
  • sloc: haskell: 510; ansic: 18; makefile: 2
file content (471 lines) | stat: -rw-r--r-- 19,933 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
{-# LANGUAGE CPP #-}
-----------------------------------------------------------------------------
-- |
-- Copyright   :  (c) 2006-2007 Duncan Coutts
-- License     :  BSD-style
--
-- Maintainer  :  duncan@haskell.org
-- Portability :  portable (H98 + FFI)
--
-- String encoding conversion
--
-----------------------------------------------------------------------------
module Codec.Text.IConv (

  -- | This module provides pure functions for converting the string encoding
  -- of strings represented by lazy 'ByteString's. This makes it easy to use
  -- either in memory or with disk or network IO.
  --
  -- For example, a simple Latin1 to UTF-8 conversion program is just:
  --
  -- > import Codec.Text.IConv as IConv
  -- > import Data.ByteString.Lazy as ByteString
  -- >
  -- > main = ByteString.interact (convert "LATIN1" "UTF-8")
  --
  -- Or you could lazily read in and convert a UTF-8 file to UTF-32 using:
  --
  -- > content <- fmap (IConv.convert "UTF-8" "UTF-32") (readFile file)
  --
  -- This module uses the POSIX @iconv()@ library function. The primary
  -- advantage of using iconv is that it is widely available, most systems
  -- have a wide range of supported string encodings and the conversion speed
  -- it typically good. The iconv library is available on all unix systems
  -- (since it is required by the POSIX.1 standard) and GNU libiconv is
  -- available as a standalone library for other systems, including Windows.

  -- * Simple conversion API
  convert,
  EncodingName,

  -- * Variant that is lax about conversion errors
  convertFuzzy,
  Fuzzy(..),

  -- * Variants that are pedantic about conversion errors
  convertStrictly,
  convertLazily,
  ConversionError(..),
  reportConversionError,
  Span(..),

  ) where

import Prelude hiding (length, span)

import Control.Exception (assert)
import qualified Control.Exception as Exception
import Foreign.C.Error as C.Error (Errno, errnoToIOError)

import qualified Data.ByteString.Lazy as L (ByteString, toChunks, fromChunks)
import qualified Data.ByteString.Lazy.Internal as L (defaultChunkSize)
import qualified Data.ByteString as S

import qualified Codec.Text.IConv.Internal as IConv
import Codec.Text.IConv.Internal (IConv)


-- | A string encoding name, eg @\"UTF-8\"@ or @\"LATIN1\"@.
--
-- The range of string encodings available is determined by the capabilities
-- of the underlying iconv implementation.
--
-- When using the GNU C or libiconv libraries, the permitted values are listed
-- by the @iconv --list@ command, and all combinations of the listed values
-- are supported.
--
type EncodingName = String

-- | Output spans from encoding conversion. When nothing goes wrong we
-- expect just a bunch of 'Span's. If there are conversion errors we get other
-- span types.
--
data Span =

    -- | An ordinary output span in the target encoding
    Span !S.ByteString

    -- | An error in the conversion process. If this occurs it will be the
    -- last span.
  | ConversionError !ConversionError

data ConversionError =
    -- | The conversion from the input to output string encoding is not
    -- supported by the underlying iconv implementation. This is usually
    -- because a named encoding is not recognised or support for it
    -- was not enabled on this system.
    --
    -- The POSIX standard does not guarantee that all possible combinations
    -- of recognised string encoding are supported, however most common
    -- implementations do support all possible combinations.
    --
    UnsuportedConversion EncodingName EncodingName

    -- | This covers two possible conversion errors:
    --
    -- * There is a byte sequence in the input that is not valid in the input
    -- encoding.
    --
    -- * There is a valid character in the input that has no corresponding
    -- character in the output encoding.
    --
    -- Unfortunately iconv does not let us distinguish these two cases. In
    -- either case, the Int parameter gives the byte offset in the input of
    -- the unrecognised bytes or unconvertable character.
    --
  | InvalidChar Int

    -- | This error covers the case where the end of the input has trailing
    -- bytes that are the initial bytes of a valid character in the input
    -- encoding. In other words, it looks like the input ended in the middle of
    -- a multi-byte character. This would often be an indication that the input
    -- was somehow truncated. Again, the Int parameter is the byte offset in
    -- the input where the incomplete character starts.
    --
  | IncompleteChar Int

    -- | An unexpected iconv error. The iconv spec lists a number of possible
    -- expected errors but does not guarantee that there might not be other
    -- errors.
    --
    -- This error can occur either immediately, which might indicate that the
    -- iconv installation is messed up somehow, or it could occur later which
    -- might indicate resource exhaustion or some other internal iconv error.
    --
    -- Use 'Foreign.C.Error.errnoToIOError' to get slightly more information
    -- on what the error could possibly be.
    --
  | UnexpectedError C.Error.Errno

reportConversionError :: ConversionError -> IOError
reportConversionError conversionError = case conversionError of
  UnsuportedConversion fromEncoding toEncoding
                          -> err $ "cannot convert from string encoding "
                             ++ show fromEncoding ++ " to string encoding "
                             ++ show toEncoding
  InvalidChar    inputPos -> err $ "invalid input sequence at byte offset "
                               ++ show inputPos
  IncompleteChar inputPos -> err $ "incomplete input sequence at byte offset "
                               ++ show inputPos
  UnexpectedError errno   -> C.Error.errnoToIOError
                               "Codec.Text.IConv: unexpected error" errno
                               Nothing Nothing
  where err msg = userError $ "Codec.Text.IConv: " ++ msg

{-# NOINLINE convert #-}
-- | Convert text from one named string encoding to another.
--
-- * The conversion is done lazily.
--
-- * An exception is thrown if conversion between the two encodings is not
-- supported.
--
-- * An exception is thrown if there are any encoding conversion errors.
--
convert :: EncodingName      -- ^ Name of input string encoding
        -> EncodingName      -- ^ Name of output string encoding
        -> L.ByteString      -- ^ Input text
        -> L.ByteString      -- ^ Output text
convert fromEncoding toEncoding =

    -- lazily convert the list of spans into an ordinary lazy ByteString:
    L.fromChunks
  . foldr span []
  . convertLazily fromEncoding toEncoding

  where
    span (Span c)            cs = c : cs
    span (ConversionError e) _  =
#if MIN_VERSION_base(4,0,0)
      Exception.throw (reportConversionError e)
#else
      Exception.throw (Exception.IOException (reportConversionError e))
#endif


data Fuzzy = Transliterate | Discard

-- | Convert text ignoring encoding conversion problems.
--
-- If invalid byte sequences are found in the input they are ignored and
-- conversion continues if possible. This is not always possible especially
-- with stateful encodings. No placeholder character is inserted into the
-- output so there will be no indication that invalid byte sequences were
-- encountered.
--
-- If there are characters in the input that have no direct corresponding
-- character in the output encoding then they are dealt in one of two ways,
-- depending on the 'Fuzzy' argument. We can try and 'Transliterate' them into
-- the nearest corresponding character(s) or use a replacement character
-- (typically @\'?\'@ or the Unicode replacement character). Alternatively they
-- can simply be 'Discard'ed.
--
-- In either case, no exceptions will occur. In the case of unrecoverable
-- errors, the output will simply be truncated. This includes the case of
-- unrecognised or unsupported encoding names; the output will be empty.
--
-- * This function only works with the GNU iconv implementation which provides
-- this feature beyond what is required by the iconv specification.
--
convertFuzzy :: Fuzzy           -- ^ Whether to try and transliterate or
                                -- discard characters with no direct conversion
             -> EncodingName    -- ^ Name of input string encoding
             -> EncodingName    -- ^ Name of output string encoding
             -> L.ByteString    -- ^ Input text
             -> L.ByteString    -- ^ Output text
convertFuzzy fuzzy fromEncoding toEncoding =

    -- lazily convert the list of spans into an ordinary lazy ByteString:
    L.fromChunks
  . foldr span []
  . convertInternal IgnoreInvalidChar fromEncoding (toEncoding ++ mode)
  where
    mode = case fuzzy of
             Transliterate -> "//IGNORE,TRANSLIT"
             Discard       -> "//IGNORE"
    span (Span c)            cs = c : cs
    span (ConversionError _) cs = cs

{-# NOINLINE convertStrictly #-}
-- | This variant does the conversion all in one go, so it is able to report
-- any conversion errors up front. It exposes all the possible error conditions
-- and never throws exceptions
--
-- The disadvantage is that no output can be produced before the whole input
-- is consumed. This might be problematic for very large inputs.
--
convertStrictly :: EncodingName      -- ^ Name of input string encoding
                -> EncodingName      -- ^ Name of output string encoding
                -> L.ByteString      -- ^ Input text
                -> Either L.ByteString
                          ConversionError -- ^ Output text or conversion error
convertStrictly fromEncoding toEncoding =
    -- strictly convert the list of spans into an ordinary lazy ByteString
    -- or an error
    strictify []
  . convertLazily fromEncoding toEncoding

  where
    strictify :: [S.ByteString] -> [Span] -> Either L.ByteString ConversionError
    strictify cs []                    = Left (L.fromChunks (reverse cs))
    strictify cs (Span c : ss)         = strictify (c:cs) ss
    strictify _  (ConversionError e:_) = Right e


{-# NOINLINE convertLazily #-}
-- | This version provides a more complete but less convenient conversion
-- interface. It exposes all the possible error conditions and never throws
-- exceptions.
--
-- The conversion is still lazy. It returns a list of spans, where a span may
-- be an ordinary span of output text or a conversion error. This somewhat
-- complex interface allows both for lazy conversion and for precise reporting
-- of conversion problems. The other functions 'convert' and 'convertStrictly'
-- are actually simple wrappers on this function.
--
convertLazily :: EncodingName  -- ^ Name of input string encoding
              -> EncodingName  -- ^ Name of output string encoding
              -> L.ByteString  -- ^ Input text
              -> [Span]        -- ^ Output text spans
convertLazily = convertInternal StopOnInvalidChar


data InvalidCharBehaviour = StopOnInvalidChar | IgnoreInvalidChar

convertInternal :: InvalidCharBehaviour
                -> EncodingName -> EncodingName
                -> L.ByteString -> [Span]
convertInternal ignore fromEncoding toEncoding input =
  IConv.run fromEncoding toEncoding $ \status -> case status of
    IConv.InitOk -> do IConv.newOutputBuffer outChunkSize
                       fillInputBuffer ignore (L.toChunks input)

    IConv.UnsupportedConversion     -> failConversion (UnsuportedConversion
                                                         fromEncoding
                                                         toEncoding)
    IConv.UnexpectedInitError errno -> failConversion (UnexpectedError errno)


fillInputBuffer :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fillInputBuffer ignore (inChunk : inChunks) = do
  IConv.pushInputBuffer inChunk
  drainBuffers ignore inChunks

fillInputBuffer _ignore [] = do
  outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable
  IConv.finalise
  if outputBufferBytesAvailable > 0
    then do outChunk <- IConv.popOutputBuffer
            return [Span outChunk]
    else return []


drainBuffers :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
drainBuffers ignore inChunks = do
  inputBufferEmpty_ <- IConv.inputBufferEmpty
  outputBufferFull <- IConv.outputBufferFull
  assert (not outputBufferFull && not inputBufferEmpty_) $ return ()
  -- this invariant guarantees we can always make forward progress

  status <- IConv.iconv

  case status of
    IConv.InputEmpty -> do
      inputBufferEmpty <- IConv.inputBufferEmpty
      assert inputBufferEmpty $ fillInputBuffer ignore inChunks

    IConv.OutputFull -> do
      outChunk <- IConv.popOutputBuffer
      outChunks <- IConv.unsafeInterleave $ do
        IConv.newOutputBuffer outChunkSize
        drainBuffers ignore inChunks
      return (Span outChunk : outChunks)

    IConv.InvalidChar -> invalidChar ignore inChunks

    IConv.IncompleteChar -> fixupBoundary ignore inChunks

    IConv.UnexpectedError errno -> failConversion (UnexpectedError errno)

-- | The posix iconv api looks like it's designed specifically for streaming
-- and it is, except for one really really annoying corner case...
--
-- Suppose you're converting a stream, say by reading a file in 4k chunks. This
-- would seem to be the canonical use case for iconv, reading and converting an
-- input file. However suppose the 4k read chunk happens to split a multi-byte
-- character. Then iconv will stop just before that char and tell us that its
-- an incomplete char. So far so good. Now what we'd like to do is have iconv
-- remember those last few bytes in its conversion state so we can carry on
-- with the next 4k block. Sadly it does not. It requires us to fix things up
-- so that it can carry on with the next block starting with a complete multi-
-- byte character. Do do that we have to somehow copy those few trailing bytes
-- to the beginning of the next block. That's perhaps not too bad in an
-- imperitive context using a mutable input buffer - we'd just copy the few
-- trailing bytes to the beginning of the buffer and do a short read (ie 4k-n
-- the number of trailing bytes). That's not terribly nice since it means the
-- OS has to do IO on non-page aligned buffers which tends to be slower. It's
-- worse for us though since we're not using a mutable input buffer, we're
-- using a lazy bytestring which is a sequence of immutable buffers.
--
-- So we have to do more cunning things. We could just prepend the trailing
-- bytes to the next block, but that would mean alocating and copying the whole
-- next block just to prepend a couple bytes. This probably happens quite
-- frequently so would be pretty slow. So we have to be even more cunning.
--
-- The solution is to create a very small buffer to cover the few bytes making
-- up the character spanning the block boundary. So we copy the trailing bytes
-- plus a few from the beginning of the next block. Then we run iconv again on
-- that small buffer. How many bytes from the next block to copy is a slightly
-- tricky issue. If we copy too few there's no guarantee that we have enough to
-- give a complete character. We opt for a maximum size of 16, 'tmpChunkSize'
-- on the theory that no encoding in existance uses that many bytes to encode a
-- single character, so it ought to be enough. Yeah, it's a tad dodgey.
--
-- Having papered over the block boundary, we still have to cross the boundary
-- of this small buffer. It looks like we've still got the same problem,
-- however this time we should have crossed over into bytes that are wholly
-- part of the large following block so we can abandon our small temp buffer
-- an continue with the following block, with a slight offset for the few bytes
-- taken up by the chars that fit into the small buffer.
--
-- So yeah, pretty complex. Check out the proof below of the tricky case.
--
fixupBoundary :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
fixupBoundary _ignore [] = do
  inputPos <- IConv.inputPosition
  failConversion (IncompleteChar inputPos)
fixupBoundary ignore inChunks@(inChunk : inChunks') = do
  inSize <- IConv.inputBufferSize
  assert (inSize < tmpChunkSize) $ return ()
  let extraBytes = tmpChunkSize - inSize

  if S.length inChunk <= extraBytes
    then do
      IConv.replaceInputBuffer (`S.append` inChunk)
      drainBuffers ignore inChunks'
    else do
      IConv.replaceInputBuffer (`S.append` S.take extraBytes inChunk)

      before <- IConv.inputBufferSize
      assert (before == tmpChunkSize) $ return ()

      status <- IConv.iconv
      after <- IConv.inputBufferSize
      let consumed = before - after

      case status of
        IConv.InputEmpty ->
          assert (consumed == tmpChunkSize) $
          fillInputBuffer ignore (S.drop extraBytes inChunk : inChunks')

        IConv.OutputFull -> do
          outChunk <- IConv.popOutputBuffer
          outChunks <- IConv.unsafeInterleave $ do
            IConv.newOutputBuffer outChunkSize
            drainBuffers ignore inChunks
          return (Span outChunk : outChunks)

        IConv.InvalidChar -> invalidChar ignore inChunks

        IConv.IncompleteChar ->
          assert (inSize < consumed && consumed < tmpChunkSize) $
          --    inSize < consumed < tmpChunkSize
          -- => { subtract inSize from each side }
          --    0 < consumed - inSize < tmpChunkSize - inSize
          -- => { by definition that extraBytes = tmpChunkSize - inSize }
          --    0 < consumed - inSize < extraBytes
          -- => { since we're in the False case of the if, we know:
          --        not (S.length inChunk <= extraBytes)
          --      =      S.length inChunk > extraBytes
          --      =      extraBytes < S.length inChunk }
          --    0 < consumed - inSize < extraBytes < S.length inChunk
          --
          -- And we're done! We know it's safe to drop (consumed - inSize) from
          -- inChunk since it's more than 0 and less than the inChunk size, so
          -- we're not being left with an empty chunk (which is not allowed).

          drainBuffers ignore (S.drop (consumed - inSize) inChunk : inChunks')

        IConv.UnexpectedError errno -> failConversion (UnexpectedError errno)


invalidChar :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span]
invalidChar StopOnInvalidChar _ = do
  inputPos <- IConv.inputPosition
  failConversion (InvalidChar inputPos)

invalidChar IgnoreInvalidChar inChunks = do
  inputPos  <- IConv.inputPosition
  let invalidCharError = ConversionError (InvalidChar inputPos)
  outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable
  if outputBufferBytesAvailable > 0
    then do outChunk  <- IConv.popOutputBuffer
            outChunks <- IConv.unsafeInterleave $ do
              IConv.newOutputBuffer outChunkSize
              inputBufferEmpty <- IConv.inputBufferEmpty
              if inputBufferEmpty
                then fillInputBuffer IgnoreInvalidChar inChunks
                else drainBuffers    IgnoreInvalidChar inChunks
            return (Span outChunk : invalidCharError : outChunks)
    else do outChunks <- IConv.unsafeInterleave $ do
              IConv.newOutputBuffer outChunkSize
              inputBufferEmpty <- IConv.inputBufferEmpty
              if inputBufferEmpty
                then fillInputBuffer IgnoreInvalidChar inChunks
                else drainBuffers    IgnoreInvalidChar inChunks
            return (invalidCharError : outChunks)

failConversion :: ConversionError -> IConv [Span]
failConversion err = do
  outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable
  IConv.finalise
  if outputBufferBytesAvailable > 0
    then do outChunk <- IConv.popOutputBuffer
            return [Span outChunk, ConversionError err]
    else    return [               ConversionError err]

outChunkSize :: Int
outChunkSize = L.defaultChunkSize

tmpChunkSize :: Int
tmpChunkSize = 16