File: Normalization.hs

package info (click to toggle)
haskell-unicode-data 0.3.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 5,156 kB
  • sloc: haskell: 26,262; makefile: 3
file content (168 lines) | stat: -rw-r--r-- 5,421 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
-- |
-- Module      : Unicode.Char.Normalization
-- Copyright   : (c) 2020 Composewell Technologies and Contributors
-- License     : Apache-2.0
-- Maintainer  : streamly@composewell.com
-- Stability   : experimental
--
-- Low level Unicode database functions to facilitate Unicode normalization.
--
-- For more information on Unicode normalization please refer to the following
-- sections of the [Unicode standard](https://www.unicode.org/versions/latest/):
--
-- * 2 General Structure
--
--     * 2.3 Compatibility Characters
--     * 2.12 Equivalent Sequences
--
-- * 3 Conformance
--
--     * 3.6 Combination
--     * 3.7 Decomposition
--     * 3.11 Normalization Forms
--     * 3.12 Conjoining Jamo Behavior
--
-- * 4 Character Properties
--
--     * 4.3 Combining Classes
--
-- * [Unicode® Standard Annex #15 - Unicode Normalization Forms](https://www.unicode.org/reports/tr15)
-- * [Unicode® Standard Annex #44 - Unicode Character Database](https://www.unicode.org/reports/tr44/)
--

module Unicode.Char.Normalization
    (
    -- * Combining class
      isCombining
    , combiningClass
    , isCombiningStarter

    -- * Composition
    , compose
    , composeStarters

    -- * Decomposition
    -- ** Non-Hangul
    , DecomposeMode(..)
    , isDecomposable
    , decompose

    -- ** Hangul
    , decomposeHangul
    )
where

import Control.Exception (assert)
import Data.Char (ord)
import GHC.Base (unsafeChr)
import Unicode.Internal.Division (quotRem21, quotRem28)
import Unicode.Char.General
    (hangulFirst, jamoLFirst, jamoTCount, jamoTFirst, jamoVCount, jamoVFirst)

import qualified Unicode.Internal.Char.UnicodeData.CombiningClass  as CC
import qualified Unicode.Internal.Char.UnicodeData.Compositions    as C
import qualified Unicode.Internal.Char.UnicodeData.Decomposable    as D
import qualified Unicode.Internal.Char.UnicodeData.DecomposableK   as K
import qualified Unicode.Internal.Char.UnicodeData.Decompositions  as D
import qualified Unicode.Internal.Char.UnicodeData.DecompositionsK as K

-------------------------------------------------------------------------------
-- Compose
-------------------------------------------------------------------------------

-- | Try to combine a starter (a character whose combining class is 0) with a
-- combining character (a character whose combining class is non-zero).
--
-- The result is 'Just' the composed character when the starter combines with
-- the combining character, and 'Nothing' when it does not.
--
-- @since 0.1.0
compose :: Char -> Char -> Maybe Char
compose = C.compose
{-# INLINE compose #-}

-- | Try to combine two starter characters.
--
-- The result is 'Just' the composed character when the two starters combine,
-- and 'Nothing' when they do not.
--
-- @since 0.1.0
composeStarters :: Char -> Char -> Maybe Char
composeStarters = C.composeStarters
{-# INLINE composeStarters #-}

-- | Test whether a starter character may combine with some starter character
-- that precedes it, i.e. whether it can be the second starter of a
-- composition.
--
-- @since 0.1.0
isCombiningStarter :: Char -> Bool
isCombiningStarter = C.isSecondStarter
{-# INLINE isCombiningStarter #-}

-------------------------------------------------------------------------------
-- Decompose
-------------------------------------------------------------------------------

-------------------------------------------------------------------------------
-- Non Hangul decomposition
-------------------------------------------------------------------------------

-- | Selects the kind of decomposition to perform: canonical or compatibility.
--
-- @since 0.1.0
data DecomposeMode
    = Canonical -- ^ Decompose in canonical mode.
    | Kompat    -- ^ Decompose in compatibility mode.

-- | Look up the decomposition of a non-Hangul character, using either the
-- canonical or the compatibility decompositions as selected by the
-- 'DecomposeMode' argument.
--
-- The characters in the result may themselves decompose further.
--
-- @since 0.1.0
decompose :: DecomposeMode -> Char -> [Char]
decompose mode = case mode of
    Canonical -> D.decompose
    Kompat    -> K.decompose
{-# INLINE decompose #-}

-- | Test whether a non-Hangul character is decomposable in the given
-- 'DecomposeMode'.
--
-- Note that in the case of compatibility decompositions a character may
-- decompose into a single compatibility character.
--
-- @since 0.1.0
isDecomposable :: DecomposeMode -> Char -> Bool
isDecomposable mode = case mode of
    Canonical -> D.isDecomposable
    Kompat    -> K.isDecomposable
{-# INLINE isDecomposable #-}

-------------------------------------------------------------------------------
-- Hangul decomposition
-------------------------------------------------------------------------------

-- | Split a Hangul syllable into its corresponding Jamo characters, returned
-- as an @(L, V, T)@ triple.
--
-- The components are computed arithmetically: the offset of the character
-- from 'hangulFirst' is divided by 28 to obtain the T index (the remainder),
-- and the quotient is divided by 21 to obtain the L index (quotient) and the
-- V index (remainder). Each index is then added to 'jamoLFirst',
-- 'jamoVFirst' and 'jamoTFirst' respectively.
--
-- No check is made here that the argument actually is a Hangul syllable.
--
-- @since 0.1.0
decomposeHangul :: Char -> (Char, Char, Char)
decomposeHangul c =
    -- The specialised division helpers hard-code the divisors, so the
    -- assertions guard against the Jamo counts drifting away from them.
    case assert (jamoTCount == 28) quotRem28 syllableIndex of
        (lvIndex, tIndex) ->
            case assert (jamoVCount == 21) quotRem21 lvIndex of
                (lIndex, vIndex) ->
                    ( unsafeChr (jamoLFirst + lIndex)
                    , unsafeChr (jamoVFirst + vIndex)
                    , unsafeChr (jamoTFirst + tIndex)
                    )

    where

    -- Zero-based offset of the character from the first Hangul syllable.
    syllableIndex = ord c - hangulFirst
{-# INLINE decomposeHangul #-}

-------------------------------------------------------------------------------
-- Combining class
-------------------------------------------------------------------------------

-- Determine the combining properties of characters.

-- | Look up the combining class of a character.
--
-- @since 0.1.0
combiningClass :: Char -> Int
combiningClass = CC.combiningClass
{-# INLINE combiningClass #-}

-- | Test whether a character is a combining character.
--
-- @since 0.1.0
isCombining :: Char -> Bool
isCombining = CC.isCombining
{-# INLINE isCombining #-}