File: UTF8.hs

package info (click to toggle)
haskell-encoding 0.10.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,392 kB
  • sloc: haskell: 4,372; ansic: 11; makefile: 4
file content (130 lines) | stat: -rw-r--r-- 5,931 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
{-# LANGUAGE DeriveDataTypeable #-}
{- | This module implements UTF-8 encoding and decoding as in RFC 3629.
     See <http://en.wikipedia.org/wiki/UTF-8> for more information.
 -}
module Data.Encoding.UTF8 where

import Control.Throws
import Data.Char
import Data.Bits

import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception

import Data.Typeable

data UTF8 = UTF8        -- ^ Very forgiving decoding mechanism, accepts everything that it can make any sense of.
          | UTF8Strict  -- ^ More strict decoding, doesn\'t accept sequences that have a too long representation and checks bits that aren\'t used in the decoding
          deriving (Eq,Show,Typeable)

instance Encoding UTF8 where
    encodeChar _ c
               | n <= 0x0000007F = p8 n
               | n <= 0x000007FF = do
                         p8 $ 0xC0 .|. (n `shiftR` 6)
                         p8 $ 0x80 .|. (n .&. 0x3F)
               | n <= 0x0000FFFF = do
                         p8 $ 0xE0 .|. (n `shiftR` 12)
                         p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
                         p8 $ 0x80 .|. (n .&. 0x3F)
               | n <= 0x0010FFFF = do
                         p8 $ 0xF0 .|. (n `shiftR` 18)
                         p8 $ 0x80 .|. ((n `shiftR` 12) .&. 0x3F)
                         p8 $ 0x80 .|. ((n `shiftR` 6) .&. 0x3F)
                         p8 $ 0x80 .|. (n .&. 0x3F)
               | otherwise = throwException (HasNoRepresentation c)
               where
                 n = ord c
                 p8 = pushWord8.fromIntegral
    encodeable _ c = c <= '\x10FFFF'
    decodeChar UTF8 = do
      w1 <- fetchWord8
      case () of 
        _
          | w1 <= 0x7F -> return $ chr $ fromIntegral w1
          | w1 <= 0xBF -> throwException (IllegalCharacter w1)
          | w1 <= 0xDF -> do
                         w2 <- fetchWord8
                         return $ chr $
                                    ((fromIntegral $ w1 .&. 0x1F) `shiftL` 6)
                                    .|. (fromIntegral $ w2 .&. 0x3F)

          | w1 <= 0xEF -> do
                         w2 <- fetchWord8
                         w3 <- fetchWord8
                         let v1 = w1 .&. 0x0F
                             v2 = w2 .&. 0x3F
                             v3 = w3 .&. 0x3F
                         return $ chr $
                                    ((fromIntegral v1) `shiftL` 12)
                                    .|. ((fromIntegral v2) `shiftL` 6)
                                    .|. (fromIntegral v3)
          | w1 <= 0xF7 -> do
                         w2 <- fetchWord8
                         w3 <- fetchWord8
                         w4 <- fetchWord8
                         let v1 = w1 .&. 0x07
                             v2 = w2 .&. 0x3F
                             v3 = w3 .&. 0x3F
                             v4 = w4 .&. 0x3F
                             v  = ((fromIntegral v1) `shiftL` 18)
                                    .|. ((fromIntegral v2) `shiftL` 12)
                                    .|. ((fromIntegral v3) `shiftL` 6)
                                    .|. (fromIntegral v4)
                         if v <= 0x10FFFF
                           then return $ chr v
                           else throwException (IllegalRepresentation [w1,w2,w3,w4])
          | otherwise -> throwException (IllegalCharacter w1)
    decodeChar UTF8Strict = do
      w1 <- fetchWord8
      case () of 
        _
          | w1 <= 0x7F -> return $ chr $ fromIntegral w1
          | w1 <= 0xBF -> throwException (IllegalCharacter w1)
          | w1 <= 0xDF -> do
                         w2 <- fetchExtend8
                         let v1 = w1 .&. 0x1F
                         if v1 <= 1
                           then throwException (IllegalRepresentation [w1,w2])
                           else return $ chr $ 
                                    ((fromIntegral v1) `shiftL` 6)
                                    .|. (fromIntegral $ w2 .&. 0x3F)
          | w1 <= 0xEF -> do
                         w2 <- fetchExtend8
                         w3 <- fetchExtend8
                         let v1 = w1 .&. 0x0F
                             v2 = w2 .&. 0x3F
                             v3 = w3 .&. 0x3F
                         if v1 == 0 && v2 < 0x20
                           then throwException (IllegalRepresentation [w1,w2,w3])
                           else return $ chr $
                                    ((fromIntegral v1) `shiftL` 12)
                                    .|. ((fromIntegral v2) `shiftL` 6)
                                    .|. (fromIntegral v3)
          | w1 <= 0xF7 -> do
                         w2 <- fetchExtend8
                         w3 <- fetchExtend8
                         w4 <- fetchExtend8
                         let v1 = w1 .&. 0x07
                             v2 = w2 .&. 0x3F
                             v3 = w3 .&. 0x3F
                             v4 = w4 .&. 0x3F
                             v = ((fromIntegral v1) `shiftL` 18)
                                    .|. ((fromIntegral v2) `shiftL` 12)
                                    .|. ((fromIntegral v3) `shiftL` 6)
                                    .|. (fromIntegral v4)
                         if v1 == 0 && v2 < 0x10
                           then throwException (IllegalRepresentation [w1,w2,w3,w4])
                           else (if v <= 0x10FFFF
                                 then return $ chr v
                                 else throwException (IllegalRepresentation [w1,w2,w3,w4]))
          | otherwise -> throwException (IllegalCharacter w1)
          where
            invalidExtend wrd = wrd .&. 0xC0 /= 0x80
            fetchExtend8 = do
              w <- fetchWord8
              if invalidExtend w 
                then throwException (IllegalCharacter w)
                else return w