File: Helper.hs

package info (click to toggle)
haskell-basement 0.0.16-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,048 kB
  • sloc: haskell: 11,336; ansic: 63; makefile: 5
file content (246 lines) | stat: -rw-r--r-- 9,228 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
-- |
-- Module      : Basement.UTF8.Helper
-- License     : BSD-style
-- Maintainer  : Foundation
--
-- Some low level helpers to use UTF8
--
-- Most helpers are lowlevel and unsafe, don't use
-- directly.
{-# LANGUAGE BangPatterns               #-}
{-# LANGUAGE MagicHash                  #-}
{-# LANGUAGE NoImplicitPrelude          #-}
{-# LANGUAGE TypeFamilies               #-}
{-# LANGUAGE FlexibleContexts           #-}
{-# LANGUAGE CPP                        #-}
module Basement.UTF8.Helper
    where

import           Basement.Compat.Base
import           Basement.Compat.Primitive
import           Basement.Types.OffsetSize
import           Basement.UTF8.Types
import           Basement.Bits
import           GHC.Prim
import           GHC.Types
import           GHC.Word

-- mask an UTF8 continuation byte (stripping the leading 10 and returning 6 valid bits)
maskContinuation# :: Word# -> Word#
maskContinuation# v = and# v 0x3f##
{-# INLINE maskContinuation# #-}

-- mask a UTF8 header for 2 bytes encoding (110xxxxx and 5 valid bits)
maskHeader2# :: Word# -> Word#
maskHeader2# h = and# h 0x1f##
{-# INLINE maskHeader2# #-}

-- mask a UTF8 header for 3 bytes encoding (1110xxxx and 4 valid bits)
maskHeader3# :: Word# -> Word#
maskHeader3# h = and# h 0xf##
{-# INLINE maskHeader3# #-}

-- mask a UTF8 header for 4 bytes encoding (11110xxx and 3 valid bits)
maskHeader4# :: Word# -> Word#
maskHeader4# h = and# h 0x7##
{-# INLINE maskHeader4# #-}

or3# :: Word# -> Word# -> Word# -> Word#
or3# a b c = or# a (or# b c)
{-# INLINE or3# #-}

or4# :: Word# -> Word# -> Word# -> Word# -> Word#
or4# a b c d = or# (or# a b) (or# c d)
{-# INLINE or4# #-}

toChar# :: Word# -> Char
toChar# w = C# (chr# (word2Int# w))
{-# INLINE toChar# #-}

toChar1 :: StepASCII -> Char
toChar1 (StepASCII (W8# w)) = C# (word8ToChar# w)

toChar2 :: StepASCII -> Word8 -> Char
toChar2 (StepASCII (W8# b1)) (W8# b2) =
    toChar# (or# (uncheckedShiftL# (maskHeader2# w1) 6#) (maskContinuation# w2))
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2

toChar3 :: StepASCII -> Word8 -> Word8 -> Char
toChar3 (StepASCII (W8# b1)) (W8# b2) (W8# b3) =
    toChar# (or3# (uncheckedShiftL# (maskHeader3# w1) 12#)
                  (uncheckedShiftL# (maskContinuation# w2) 6#)
                  (maskContinuation# w3)
            )
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2
    w3 = word8ToWord# b3

toChar4 :: StepASCII -> Word8 -> Word8 -> Word8 -> Char
toChar4 (StepASCII (W8# b1)) (W8# b2) (W8# b3) (W8# b4) =
    toChar# (or4# (uncheckedShiftL# (maskHeader4# w1) 18#)
                  (uncheckedShiftL# (maskContinuation# w2) 12#)
                  (uncheckedShiftL# (maskContinuation# w3) 6#)
                  (maskContinuation# w4)
            )
  where
    w1 = word8ToWord# b1
    w2 = word8ToWord# b2
    w3 = word8ToWord# b3
    w4 = word8ToWord# b4

-- | Different way to encode a Character in UTF8 represented as an ADT
data UTF8Char =
      UTF8_1 {-# UNPACK #-} !Word8
    | UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
    | UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8
    | UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8

-- | Transform a Unicode code point 'Char' into
--
-- note that we expect here a valid unicode code point in the *allowed* range.
-- bits will be lost if going above 0x10ffff
asUTF8Char :: Char -> UTF8Char
asUTF8Char !(C# c)
  | bool# (ltWord# x 0x80##   ) = encode1
  | bool# (ltWord# x 0x800##  ) = encode2
  | bool# (ltWord# x 0x10000##) = encode3
  | otherwise                   = encode4
    where
      !x = int2Word# (ord# c)

      encode1 = UTF8_1 (W8# (wordToWord8# x))
      encode2 =
          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 6#) 0xc0##))
              !x2 = toContinuation x
           in UTF8_2 x1 x2
      encode3 =
          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 12#) 0xe0##))
              !x2 = toContinuation (uncheckedShiftRL# x 6#)
              !x3 = toContinuation x
           in UTF8_3 x1 x2 x3
      encode4 =
          let !x1 = W8# (wordToWord8# (or# (uncheckedShiftRL# x 18#) 0xf0##))
              !x2 = toContinuation (uncheckedShiftRL# x 12#)
              !x3 = toContinuation (uncheckedShiftRL# x 6#)
              !x4 = toContinuation x
           in UTF8_4 x1 x2 x3 x4

      toContinuation :: Word# -> Word8
      toContinuation w = W8# (wordToWord8# (or# (and# w 0x3f##) 0x80##))
      {-# INLINE toContinuation #-}

-- given the encoding of UTF8 Char, get the number of bytes of this sequence
numBytes :: UTF8Char -> CountOf Word8
numBytes UTF8_1{} = CountOf 1
numBytes UTF8_2{} = CountOf 2
numBytes UTF8_3{} = CountOf 3
numBytes UTF8_4{} = CountOf 4

-- given the leading byte of a utf8 sequence, get the number of bytes of this sequence
skipNextHeaderValue :: Word8 -> CountOf Word8
skipNextHeaderValue !x
    | x < 0xC0  = CountOf 1 -- 0b11000000
    | x < 0xE0  = CountOf 2 -- 0b11100000
    | x < 0xF0  = CountOf 3 -- 0b11110000
    | otherwise = CountOf 4
{-# INLINE skipNextHeaderValue #-}

headerIsAscii :: StepASCII -> Bool
headerIsAscii (StepASCII x) = x < 0x80

charToBytes :: Int -> CountOf Word8
charToBytes c
    | c < 0x80     = CountOf 1
    | c < 0x800    = CountOf 2
    | c < 0x10000  = CountOf 3
    | c < 0x110000 = CountOf 4
    | otherwise    = error ("invalid code point: " `mappend` show c)

-- | Encode a Char into a CharUTF8
encodeCharUTF8 :: Char -> CharUTF8
encodeCharUTF8 !(C# c)
    | bool# (ltWord# x 0x80##   ) = CharUTF8 (W32# (wordToWord32# x))
    | bool# (ltWord# x 0x800##  ) = CharUTF8 (W32# (wordToWord32# encode2))
    | bool# (ltWord# x 0x10000##) = CharUTF8 (W32# (wordToWord32# encode3))
    | otherwise                   = CharUTF8 (W32# (wordToWord32# encode4))
  where
    !x = int2Word# (ord# c)

    -- clearing mask, clearing all the bits that need to be clear as per the UTF8 encoding
    mask2 = 0x0000bfdf## -- 1 continuation , 5 bits header
    mask3 = 0x00bfbfef## -- 2 continuations, 4 bits header
    mask4 = 0xbfbfbff7## -- 3 continuations, 3 bits header

    -- setting mask, settings all the bits that need to be set per the UTF8 encoding
    set2  = 0x000080c0## -- 10xxxxxx     110xxxxx
    set3  = 0x008080e0## -- 10xxxxxx * 2 1110xxxx
    set4  = 0x808080f0## -- 10xxxxxx * 3 11111xxx

    encode2 = and# mask2 (or3# set2
                               (uncheckedShiftRL# x 6#) -- 5 bits to 1st byte
                               (uncheckedShiftL# x 8# ) -- move lowest bits to the 2nd byte
                         )
    encode3 = and# mask3 (or4# set3
                               (uncheckedShiftRL# x 12#) -- 4 bits to 1st byte
                               (and# 0x3f00## (uncheckedShiftL# x 2#)) -- 6 bits to the 2nd byte
                               (uncheckedShiftL# x 16# ) -- move lowest bits to the 3rd byte
                         )
    encode4 = and# mask4 (or4# set4
                               (uncheckedShiftRL# x 18#) -- 3 bits to 1st byte
                               (or# (and# 0x3f00## (uncheckedShiftRL# x 4#))   -- 6 bits to the 2nd byte
                                    (and# 0x3f0000## (uncheckedShiftL# x 10#)) -- 6 bits to the 3nd byte
                               )
                               (uncheckedShiftL# x 24# ) -- move lowest bits to the 4rd byte
                         )

-- | decode a CharUTF8 into a Char
--
-- If the value inside a CharUTF8 is not properly encoded, this will result in violation
-- of the Char invariants
decodeCharUTF8 :: CharUTF8 -> Char
decodeCharUTF8 c@(CharUTF8 !(W32# w_))
    | isCharUTF8Case1 c = toChar# w
    | isCharUTF8Case2 c = encode2
    | isCharUTF8Case3 c = encode3
    | otherwise         = encode4
  where
    w = word32ToWord# w_
    encode2 =
        toChar# (or# (uncheckedShiftL# (maskHeader2# w) 6#)
                     (maskContinuation# (uncheckedShiftRL# w 8#))
                )
    encode3 =
        toChar# (or3# (uncheckedShiftL# (maskHeader3# w) 12#)
                      (uncheckedShiftRL# (and# 0x3f00## w) 8#)
                      (maskContinuation# (uncheckedShiftRL# w 16#))
                )
    encode4 =
        toChar# (or4# (uncheckedShiftL#  (maskHeader4# w) 18#)
                      (uncheckedShiftRL# (and# 0x3f00## w) 10#)
                      (uncheckedShiftL#  (and# 0x3f0000## w) 4#)
                      (maskContinuation# (uncheckedShiftRL# w 24#))
                )

    -- clearing mask, removing all UTF8 metadata and keeping only signal (content)
    --maskContent2 = 0x00003f1f## -- 1 continuation , 5 bits header
    --maskContent3 = 0x003f3f0f## -- 2 continuations, 4 bits header
    --maskContent4 = 0x3f3f3f07## -- 3 continuations, 3 bits header

isCharUTF8Case1 :: CharUTF8 -> Bool
isCharUTF8Case1 (CharUTF8 !w) = (w .&. 0x80) == 0
{-# INLINE isCharUTF8Case1 #-}

isCharUTF8Case2 :: CharUTF8 -> Bool
isCharUTF8Case2 (CharUTF8 !w) = (w .&. 0x20) == 0
{-# INLINE isCharUTF8Case2 #-}

isCharUTF8Case3 :: CharUTF8 -> Bool
isCharUTF8Case3 (CharUTF8 !w) = (w .&. 0x10) == 0
{-# INLINE isCharUTF8Case3 #-}

isCharUTF8Case4 :: CharUTF8 -> Bool
isCharUTF8Case4 (CharUTF8 !w) = (w .&. 0x08) == 0
{-# INLINE isCharUTF8Case4 #-}