File: NormalizeStream.hs

package info (click to toggle)
haskell-unicode-transforms 0.4.0.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,312 kB
  • sloc: haskell: 786; sh: 15; makefile: 7
file content (545 lines) | stat: -rw-r--r-- 18,878 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
{-# OPTIONS_GHC -funbox-strict-fields #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TupleSections #-}
-- |
-- Module      : Data.Unicode.Internal.NormalizeStream
-- Copyright   : (c) 2016 Harendra Kumar
--               (c) 2020 Andrew Lelechenko
--
-- License     : BSD-3-Clause
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
--
-- Stream based normalization.
--
module Data.Unicode.Internal.NormalizeStream
    (
      UC.DecomposeMode(..)
    , stream
    , unstream
    , unstreamC
    )
    where

import Data.Char (chr, ord)
import GHC.ST (ST(..))
import GHC.Types (SPEC(..))

import qualified Data.Text.Array as A
import qualified Unicode.Char as UC

#if MIN_VERSION_text(2,0,0)
import Data.Text.Internal.Fusion (stream)
#else
import Data.Bits (shiftR)
import Data.Text.Internal.Unsafe.Char (unsafeChr)
import Data.Text.Internal.Fusion.Size (betweenSize)
import Data.Text.Internal.Encoding.Utf16 (chr2)
#endif

-- Internal modules
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Fusion.Size (upperBound)
import Data.Text.Internal.Fusion.Types (Step(..), Stream(..))
import Data.Text.Internal.Private (runText)
import Data.Text.Internal.Unsafe.Char (unsafeWrite)

-------------------------------------------------------------------------------
-- Reorder buffer to hold characters till the next starter boundary
-------------------------------------------------------------------------------

-- | A list of combining characters, ordered by 'UC.combiningClass'.
-- Couple of top levels are unrolled and unpacked for efficiency.
data ReBuf = Empty | One !Char | Many !Char !Char ![Char]

{-# INLINE insertIntoReBuf #-}
insertIntoReBuf :: Char -> ReBuf -> ReBuf
insertIntoReBuf c Empty = One c
insertIntoReBuf c (One c0)
    | UC.combiningClass c < UC.combiningClass c0
    = Many c c0 []
    | otherwise
    = Many c0 c []
insertIntoReBuf c (Many c0 c1 cs)
    | cc < UC.combiningClass c0
    = Many c c0 (c1 : cs)
    | cc < UC.combiningClass c1
    = Many c0 c (c1 : cs)
    | otherwise
    = Many c0 c1 (cs' ++ (c : cs''))
    where
        cc = UC.combiningClass c
        (cs', cs'') = span ((<= cc) . UC.combiningClass) cs

writeStr :: A.MArray s -> Int -> [Char] -> ST s Int
writeStr marr di str = go di str
    where
        go i [] = return i
        go i (c : cs) = do
            n <- unsafeWrite marr i c
            go (i + n) cs

{-# INLINE writeReorderBuffer #-}
writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int
writeReorderBuffer _ di Empty = return di

writeReorderBuffer marr di (One c) = do
    n <- unsafeWrite marr di c
    return (di + n)

writeReorderBuffer marr di (Many c1 c2 str) = do
    n1 <- unsafeWrite marr di c1
    n2 <- unsafeWrite marr (di + n1) c2
    writeStr marr (di + n1 + n2) str

-------------------------------------------------------------------------------
-- Decomposition of Hangul characters is done algorithmically
-------------------------------------------------------------------------------

-- {-# INLINE decomposeCharHangul #-}
decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int
decomposeCharHangul marr j c =
    if t == chr UC.jamoTFirst then do
        n1 <- unsafeWrite marr j l
        n2 <- unsafeWrite marr (j + n1) v
        return (j + n1 + n2)
    else do
        n1 <- unsafeWrite marr j l
        n2 <- unsafeWrite marr (j + n1) v
        n3 <- unsafeWrite marr (j + n1 + n2) t
        return (j + n1 + n2 + n3)
    where
        (l, v, t) = UC.decomposeHangul c

{-# INLINE decomposeChar #-}
decomposeChar
    :: UC.DecomposeMode
    -> A.MArray s       -- destination array for decomposition
    -> Int              -- array index
    -> ReBuf            -- reorder buffer
    -> Char             -- char to be decomposed
    -> ST s (Int, ReBuf)
decomposeChar mode marr index reBuf ch
    | UC.isHangul ch = do
        j <- writeReorderBuffer marr index reBuf
        (, Empty) <$> decomposeCharHangul marr j ch
    | UC.isDecomposable mode ch =
        decomposeAll marr index reBuf (UC.decompose mode ch)
    | otherwise =
        reorder marr index reBuf ch

    where

    {-# INLINE decomposeAll #-}
    decomposeAll _ i rbuf [] = return (i, rbuf)
    decomposeAll arr i rbuf (x : xs)
        | UC.isDecomposable mode x = do
            (i', rbuf') <- decomposeAll arr i rbuf (UC.decompose mode x)
            decomposeAll arr i' rbuf' xs
        | otherwise  = do
            (i', rbuf') <- reorder arr i rbuf x
            decomposeAll arr i' rbuf' xs

    {-# INLINE reorder #-}
    reorder arr i rbuf c
        | UC.isCombining c = return (i, insertIntoReBuf c rbuf)
        | otherwise = do
            j <- writeReorderBuffer arr i rbuf
            n <- unsafeWrite arr j c
            return (j + n, Empty)

#if !MIN_VERSION_text(2,0,0)
-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
stream :: Text -> Stream Char
stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
    where
      !end = off+len
      {-# INLINE next #-}
      next !i
          | i >= end                   = Done
          -- shift generates only two branches instead of three in case of
          -- range check, works quite a bit faster with llvm backend.
          | (n `shiftR` 10) == 0x36    = Yield (chr2 n n2) (i + 2)
          | otherwise                  = Yield (unsafeChr n) (i + 1)
          where
            n  = A.unsafeIndex arr i
            n2 = A.unsafeIndex arr (i + 1)
{-# INLINE [0] stream #-}
#endif

-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
unstream :: UC.DecomposeMode -> Stream Char -> Text
unstream mode (Stream next0 s0 len) = runText $ \done -> do
  -- Before encoding each char we perform a buffer realloc check assuming
  -- worst case encoding size of two 16-bit units for the char. Just add an
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
  let margin = 1 + maxDecomposeLen
      mlen = (upperBound 4 len + margin)
  arr0 <- A.new mlen
  let outer !arr !maxi = encode
       where
        -- keep the common case loop as small as possible
        encode !si !di rbuf =
            -- simply check for the worst case
            if maxi < di + margin
            then realloc si di rbuf
            else
                case next0 si of
                    Done -> do
                        di' <- writeReorderBuffer arr di rbuf
                        done arr di'
                    Skip si'    -> encode si' di rbuf
                    Yield c si' -> do
                                (di', rbuf') <- decomposeChar mode arr di rbuf c
                                encode si' di' rbuf'
                                -- n <- unsafeWrite arr di c
                                -- encode si' (di + n) rbuf

        -- keep uncommon case separate from the common case code
        {-# NOINLINE realloc #-}
        realloc !si !di rbuf = do
            let newlen = maxi * 2
            arr' <- A.new newlen
            A.copyM arr' 0 arr 0 di
            outer arr' (newlen - 1) si di rbuf

  outer arr0 (mlen - 1) s0 0 Empty
{-# INLINE [0] unstream #-}

-- we can generate this from UCD
maxDecomposeLen :: Int
maxDecomposeLen = 32

-------------------------------------------------------------------------------
-- Composition
-------------------------------------------------------------------------------

-- If we are composing we do not need to first decompose Hangul. We can just
-- compose assuming there could be some partially composed syllables e.g. LV
-- syllable followed by a jamo T. We need to compose this case as well.

-- Hold an L to wait for V, hold an LV to wait for T.
data JamoBuf
    = Jamo !Char -- Jamo L, V or T
    | Hangul !Char -- Hangul Syllable LV or LVT
    | HangulLV !Char

data RegBuf
    = RegOne !Char
    | RegMany !Char !Char ![Char]

data ComposeState
    = ComposeNone
    | ComposeReg !RegBuf
    | ComposeJamo !JamoBuf

-------------------------------------------------------------------------------
-- Composition of Jamo into Hangul syllables, done algorithmically
-------------------------------------------------------------------------------

{-# INLINE writeJamoBuf #-}
writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int
writeJamoBuf arr i jbuf = do
    n <- unsafeWrite arr i (getCh jbuf)
    return (i + n)

    where

    getCh (Jamo ch) = ch
    getCh (Hangul ch) = ch
    getCh (HangulLV ch) = ch

{-# INLINE initHangul #-}
initHangul :: Char -> Int -> ST s (Int, ComposeState)
initHangul c i = return (i, ComposeJamo (Hangul c))

{-# INLINE initJamo #-}
initJamo :: Char -> Int -> ST s (Int, ComposeState)
initJamo c i = return (i, ComposeJamo (Jamo c))

{-# INLINE insertJamo #-}
insertJamo
    :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState)
insertJamo arr i jbuf ch
    | ich <= UC.jamoLLast = do
        j <- writeJamoBuf arr i jbuf
        return (j, ComposeJamo (Jamo ch))
    | ich < UC.jamoVFirst =
        flushAndWrite arr i jbuf ch
    | ich <= UC.jamoVLast = do
        case jbuf of
            Jamo c ->
                case UC.jamoLIndex c of
                    Just li ->
                        let vi = ich - UC.jamoVFirst
                            lvi = li * UC.jamoNCount + vi * UC.jamoTCount
                            lv = chr (UC.hangulFirst + lvi)
                         in return (i, ComposeJamo (HangulLV lv))
                    Nothing -> writeTwo arr i c ch
            Hangul c -> writeTwo arr i c ch
            HangulLV c -> writeTwo arr i c ch
    | ich <= UC.jamoTFirst = do
        flushAndWrite arr i jbuf ch
    | otherwise = do
        let ti = ich - UC.jamoTFirst
        case jbuf of
            Jamo c -> writeTwo arr i c ch
            Hangul c
                | UC.isHangulLV c -> do
                    writeLVT arr i c ti
                | otherwise ->
                    writeTwo arr i c ch
            HangulLV c ->
                writeLVT arr i c ti

    where

    ich = ord ch

    {-# INLINE flushAndWrite #-}
    flushAndWrite marr ix jb c = do
        j <- writeJamoBuf marr ix jb
        n <- unsafeWrite marr j c
        return (j + n, ComposeNone)

    {-# INLINE writeLVT #-}
    writeLVT marr ix lv ti = do
        n <- unsafeWrite marr ix (chr ((ord lv) + ti))
        return (ix + n, ComposeNone)

    {-# INLINE writeTwo #-}
    writeTwo marr ix c1 c2 = do
        n <- unsafeWrite marr ix c1
        m <- unsafeWrite marr (ix + n) c2
        return ((ix + n + m), ComposeNone)

{-# INLINE insertHangul #-}
insertHangul
    :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState)
insertHangul arr i jbuf ch = do
    j <- writeJamoBuf arr i jbuf
    return (j, ComposeJamo (Hangul ch))

{-# INLINE insertIntoRegBuf #-}
insertIntoRegBuf :: Char -> RegBuf -> RegBuf
insertIntoRegBuf c (RegOne c0)
    | UC.combiningClass c < UC.combiningClass c0
    = RegMany c c0 []
    | otherwise
    = RegMany c0 c []
insertIntoRegBuf c (RegMany c0 c1 cs)
    | cc < UC.combiningClass c0
    = RegMany c c0 (c1 : cs)
    | cc < UC.combiningClass c1
    = RegMany c0 c (c1 : cs)
    | otherwise
    = RegMany c0 c1 (cs' ++ (c : cs''))
    where
        cc = UC.combiningClass c
        (cs', cs'') = span ((<= cc) . UC.combiningClass) cs

{-# INLINE writeRegBuf #-}
writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int
writeRegBuf arr i = \case
    RegOne c -> do
        n <- unsafeWrite arr i c
        return (i + n)
    RegMany st c [] ->
        case UC.compose st c of
            Just x -> do
                n <- unsafeWrite arr i x
                return (i + n)
            Nothing -> do
                n <- unsafeWrite arr i st
                m <- unsafeWrite arr (i + n) c
                return (i + n + m)
    RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0)

    where

    -- arguments: uncombined chars, starter, unprocessed str
    go uncs st [] = writeStr arr i (st : uncs)
    go uncs st (c : cs) = case UC.compose st c of
        Nothing -> go (uncs ++ (c : same)) st bigger
        Just x  -> go uncs x cs
        where
            cc = UC.combiningClass c
            (same, bigger) = span ((== cc) . UC.combiningClass) cs

{-# INLINE flushComposeState #-}
flushComposeState :: A.MArray s -> Int -> ComposeState -> ST s Int
flushComposeState arr i = \case
    ComposeNone -> pure i
    ComposeReg rbuf -> writeRegBuf arr i rbuf
    ComposeJamo jbuf -> writeJamoBuf arr i jbuf

{-# INLINE composeChar #-}
composeChar
    :: UC.DecomposeMode
    -> A.MArray s       -- destination array for composition
    -> Char             -- input char
    -> Int              -- array index
    -> ComposeState
    -> ST s (Int, ComposeState)
composeChar mode marr = go0

    where

    go0 ch !i !st =
        case st of
            ComposeReg rbuf
                | ich < UC.jamoLFirst ->
                    composeReg rbuf ch i st
                | ich <= UC.jamoTLast -> do
                    j <- writeRegBuf marr i rbuf
                    initJamo ch j
                | ich < UC.hangulFirst ->
                    composeReg rbuf ch i st
                | ich <= UC.hangulLast -> do
                    j <- writeRegBuf marr i rbuf
                    initHangul ch j
                | otherwise ->
                    composeReg rbuf ch i st
            ComposeJamo jbuf
                | ich < UC.jamoLFirst -> do
                    jamoToReg marr i jbuf ch
                | ich <= UC.jamoTLast -> do
                    insertJamo marr i jbuf ch
                | ich < UC.hangulFirst ->
                    jamoToReg marr i jbuf ch
                | ich <= UC.hangulLast -> do
                    insertHangul marr i jbuf ch
                | otherwise ->
                    jamoToReg marr i jbuf ch
            ComposeNone
                | ich < UC.jamoLFirst ->
                    initReg ch i
                | ich <= UC.jamoTLast ->
                    initJamo ch i
                | ich < UC.hangulFirst ->
                    initReg ch i
                | ich <= UC.hangulLast ->
                    initHangul ch i
                | otherwise ->
                    initReg ch i
        where ich = ord ch

    {-# INLINE jamoToReg #-}
    jamoToReg arr i jbuf ch = do
        j <- writeJamoBuf arr i jbuf
        initReg ch j

    {-# INLINE initReg #-}
    initReg !ch !i
        | UC.isDecomposable mode ch =
            go (UC.decompose mode ch) i ComposeNone
        | otherwise =
            pure (i, ComposeReg (RegOne ch))

    {-# INLINE composeReg #-}
    composeReg rbuf !ch !i !st
        | UC.isDecomposable mode ch =
            go (UC.decompose mode ch) i st
        | UC.isCombining ch = do
            pure (i, ComposeReg (insertIntoRegBuf ch rbuf))
        -- The first char in RegBuf may or may not be a starter. In
        -- case it is not we rely on composeStarters failing.
        | RegOne s <- rbuf
        , UC.isCombiningStarter ch
        , Just x <- UC.composeStarters s ch =
            pure (i, (ComposeReg (RegOne x)))
        | otherwise = do
            j <- writeRegBuf marr i rbuf
            pure (j, ComposeReg (RegOne ch))

    go [] !i !st = pure (i, st)
    go (ch : rest) i st =
        case st of
            ComposeReg rbuf
                | UC.isHangul ch -> do
                    j <- writeRegBuf marr i rbuf
                    (k, s) <- initHangul ch j
                    go rest k s
                | UC.isJamo ch -> do
                    j <- writeRegBuf marr i rbuf
                    (k, s) <- initJamo ch j
                    go rest k s
                | UC.isDecomposable mode ch ->
                    go (UC.decompose mode ch ++ rest) i st
                | UC.isCombining ch -> do
                    go rest i (ComposeReg (insertIntoRegBuf ch rbuf))
                | RegOne s <- rbuf
                , UC.isCombiningStarter ch
                , Just x <- UC.composeStarters s ch ->
                    go rest i (ComposeReg (RegOne x))
                | otherwise -> do
                    j <- writeRegBuf marr i rbuf
                    go rest j (ComposeReg (RegOne ch))
            ComposeJamo jbuf
                | UC.isJamo ch -> do
                    (j, s) <- insertJamo marr i jbuf ch
                    go rest j s
                | UC.isHangul ch -> do
                    (j, s) <- insertHangul marr i jbuf ch
                    go rest j s
                | otherwise -> do
                    j <- writeJamoBuf marr i jbuf
                    case () of
                        _
                            | UC.isDecomposable mode ch ->
                                go (UC.decompose mode ch ++ rest) j
                                   ComposeNone
                            | otherwise ->
                                go rest j (ComposeReg (RegOne ch))
            ComposeNone
                | UC.isHangul ch -> do
                    (j, s) <- initHangul ch i
                    go rest j s
                | UC.isJamo ch -> do
                    (j, s) <- initJamo ch i
                    go rest j s
                | UC.isDecomposable mode ch ->
                    go (UC.decompose mode ch ++ rest) i st
                | otherwise ->
                    go rest i (ComposeReg (RegOne ch))

-- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'.
unstreamC :: UC.DecomposeMode -> Stream Char -> Text
unstreamC mode (Stream next0 s0 len) = runText $ \done -> do
  -- Before encoding each char we perform a buffer realloc check assuming
  -- worst case encoding size of two 16-bit units for the char. Just add an
  -- extra space to the buffer so that we do not end up reallocating even when
  -- all the chars are encoded as single unit.
  let margin = 1 + maxDecomposeLen
      mlen = (upperBound 4 len + margin)
  arr0 <- A.new mlen
  let outer !arr !maxi = encode SPEC
       where
        -- keep the common case loop as small as possible
        encode !_ !si !di st =
            -- simply check for the worst case
            if maxi < di + margin
               then realloc si di st
            else
                case next0 si of
                    Done -> do
                        di' <- flushComposeState arr di st
                        done arr di'
                    Skip si'    -> encode SPEC si' di st
                    Yield c si' -> do
                        (di', st') <- composeChar mode arr c di st
                        encode SPEC si' di' st'

        -- keep uncommon case separate from the common case code
        {-# NOINLINE realloc #-}
        realloc !si !di st = do
            let newlen = maxi * 2
            arr' <- A.new newlen
            A.copyM arr' 0 arr 0 di
            outer arr' (newlen - 1) si di st

  outer arr0 (mlen - 1) s0 0 ComposeNone
{-# INLINE [0] unstreamC #-}