1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
|
{-# LANGUAGE CPP, MagicHash, BangPatterns #-}
-- |
-- Module : Data.Text.Internal.Encoding.Utf8
-- Copyright : (c) 2008, 2009 Tom Harper,
-- (c) 2009, 2010 Bryan O'Sullivan,
-- (c) 2009 Duncan Coutts
--
-- License : BSD-style
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Basic UTF-8 validation and character manipulation.
module Data.Text.Internal.Encoding.Utf8
(
-- Decomposition
ord2
, ord3
, ord4
-- Construction
, chr2
, chr3
, chr4
-- * Validation
, validate1
, validate2
, validate3
, validate4
) where
#if defined(TEST_SUITE)
# undef ASSERTS
#endif
#if defined(ASSERTS)
import Control.Exception (assert)
#endif
import Data.Bits ((.&.))
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Text.Internal.Unsafe.Shift (shiftR)
import GHC.Exts
import GHC.Word (Word8(..))
default(Int)
between :: Word8 -- ^ byte to check
-> Word8 -- ^ lower bound
-> Word8 -- ^ upper bound
-> Bool
between x y z = x >= y && x <= z
{-# INLINE between #-}
ord2 :: Char -> (Word8,Word8)
ord2 c =
#if defined(ASSERTS)
assert (n >= 0x80 && n <= 0x07ff)
#endif
(x1,x2)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
x2 = fromIntegral $ (n .&. 0x3F) + 0x80
ord3 :: Char -> (Word8,Word8,Word8)
ord3 c =
#if defined(ASSERTS)
assert (n >= 0x0800 && n <= 0xffff)
#endif
(x1,x2,x3)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x3 = fromIntegral $ (n .&. 0x3F) + 0x80
ord4 :: Char -> (Word8,Word8,Word8,Word8)
ord4 c =
#if defined(ASSERTS)
assert (n >= 0x10000)
#endif
(x1,x2,x3,x4)
where
n = ord c
x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (n .&. 0x3F) + 0x80
chr2 :: Word8 -> Word8 -> Char
chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
!z2# = y2# -# 0x80#
{-# INLINE chr2 #-}
chr3 :: Word8 -> Word8 -> Word8 -> Char
chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!y3# = word2Int# x3#
!z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
!z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
!z3# = y3# -# 0x80#
{-# INLINE chr3 #-}
chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char
chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
C# (chr# (z1# +# z2# +# z3# +# z4#))
where
!y1# = word2Int# x1#
!y2# = word2Int# x2#
!y3# = word2Int# x3#
!y4# = word2Int# x4#
!z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
!z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
!z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
!z4# = y4# -# 0x80#
{-# INLINE chr4 #-}
validate1 :: Word8 -> Bool
validate1 x1 = x1 <= 0x7F
{-# INLINE validate1 #-}
validate2 :: Word8 -> Word8 -> Bool
validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF
{-# INLINE validate2 #-}
validate3 :: Word8 -> Word8 -> Word8 -> Bool
{-# INLINE validate3 #-}
validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4
where
validate3_1 = (x1 == 0xE0) &&
between x2 0xA0 0xBF &&
between x3 0x80 0xBF
validate3_2 = between x1 0xE1 0xEC &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF
validate3_3 = x1 == 0xED &&
between x2 0x80 0x9F &&
between x3 0x80 0xBF
validate3_4 = between x1 0xEE 0xEF &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF
validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
{-# INLINE validate4 #-}
validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
where
validate4_1 = x1 == 0xF0 &&
between x2 0x90 0xBF &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF
validate4_2 = between x1 0xF1 0xF3 &&
between x2 0x80 0xBF &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF
validate4_3 = x1 == 0xF4 &&
between x2 0x80 0x8F &&
between x3 0x80 0xBF &&
between x4 0x80 0xBF
|