File: UCD2Haskell.hs

package info (click to toggle)
haskell-unicode-transforms 0.3.6-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 3,672 kB
  • sloc: haskell: 10,801; makefile: 7
file content (397 lines) | stat: -rw-r--r-- 14,975 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
{-# LANGUAGE DeriveAnyClass      #-}
{-# LANGUAGE DeriveGeneric       #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TupleSections       #-}

-- |
-- Module      : Script to parse Unicode XML database and convert
--               it to Haskell data structures
--
-- Copyright   : (c) 2014–2015 Antonio Nikishaev
--               (c) 2016-2017 Harendra Kumar
--
-- License     : BSD-style
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
--
--
module Main where

import           Prelude hiding (pred)
import           Control.DeepSeq      (NFData (..), deepseq)
import           Control.Exception
import           Data.Binary          as Bin
import qualified Data.ByteString.Lazy as L
import           Data.Char            (chr)
import           Data.Char            (ord)
import           Data.Map             ((!))
import qualified Data.Map             as M
import           Data.Monoid          ((<>))
import qualified Data.Set             as S
import           GHC.Generics         (Generic)
import           System.FilePath      ((-<.>))
import           Text.HTML.TagSoup    (Tag (..), parseTags)
import           WithCli              (withCli)

import           Data.Unicode.Properties.DecomposeHangul (isHangul)

data GeneralCategory =
    Lu|Ll|Lt|             --LC
    Lm|Lo|                --L
    Mn|Mc|Me|             --M
    Nd|Nl|No|             --N
    Pc|Pd|Ps|Pe|Pi|Pf|Po| --P
    Sm|Sc|Sk|So|          --S
    Zs|Zl|Zp|             --Z
    Cc|Cf|Cs|Co|Cn        --C
    deriving (Show, Read, Generic, NFData, Binary)

data DecompType =
       DTCanonical | DTCompat  | DTFont
     | DTNoBreak   | DTInitial | DTMedial   | DTFinal
     | DTIsolated  | DTCircle  | DTSuper    | DTSub
     | DTVertical  | DTWide    | DTNarrow
     | DTSmall     | DTSquare  | DTFraction
    deriving (Show,Eq,Generic, NFData, Binary)

data Decomp = DCSelf | DC [Char] deriving (Show,Eq,Generic, NFData, Binary)
data QCValue = QCYes | QCNo | QCMaybe deriving (Show,Generic, NFData, Binary)

data CharProps = CharProps {
      _name                       :: String,
      _generalCategory            :: GeneralCategory,
      _upper                      :: Bool,
      _lower                      :: Bool,
      _otherUpper                 :: Bool,
      _otherLower                 :: Bool,
      _nfc_qc                     :: QCValue,
      _nfd_qc                     :: Bool,
      _nfkc_qc                    :: QCValue,
      _nfkd_qc                    :: Bool,
      _combiningClass             :: Int,
      _dash                       :: Bool,
      _hyphen                     :: Bool,
      _quotationMark              :: Bool,
      _terminalPunctuation        :: Bool,
      _diactric                   :: Bool,
      _extender                   :: Bool,
      _decomposition              :: Decomp,
      _decompositionType          :: Maybe DecompType,
      _fullDecompositionExclusion :: Bool
} deriving (Show,Generic, NFData, Binary)

-------------------------------------------------------------------------------
-- Generate data structures for decompositions
-------------------------------------------------------------------------------

genSignature :: String -> String
genSignature testBit = testBit <> " :: Char -> Bool"

genRangeCheck :: String -> [Int] -> String
genRangeCheck testBit ordList =
      testBit <> " c | (ord c) < "
      <> show (minimum ordList) <> " || (ord c) > "
      <> show (maximum ordList) <> " = False"

genMinMax :: String -> [Int] -> String
genMinMax prefix ordList = unlines
    [ ""
    , prefix <> "Min, " <> prefix <> "Max :: Int"
    , prefix <> "Min = " <> show (minimum ordList)
    , prefix <> "Max = " <> show (maximum ordList)
    ]

genBitmap :: String -> [Int] -> String
genBitmap prefix ordList =
  -- On ARM, compilation fails with llvm optimizer crashing when one big list
  -- is used. Split it into two to avoid the problem.
  let l  = length ordList
      mn = minimum ordList
      mx = maximum ordList
      (ordList1, ordList2) = splitAt (div l 2) ordList
  in  unlines
    [ "bitList1, bitList2 :: [Int]"
    , "bitList1 = " ++ show ordList1
    , "bitList2 = " ++ show ordList2
    , ""
    , prefix <> "Bitmap :: BitArray"
    , prefix <> "Bitmap = bitArraySetBits "
      ++ (show (mn, mx))
      ++ " $ bitList1 ++ bitList2"
    ]

genCombiningClass :: PropertiesDB -> String -> String
genCombiningClass props file = unlines
            [ "-- autogenerated from Unicode data"
            , "module Data.Unicode.Properties." <> file
            , "(getCombiningClass, isCombining)"
            , "where"
            , ""
            , "import Data.Char (ord)"
            , "import Data.BitArray (BitArray, lookupBit)"
            , "import Data.Unicode.Properties.BitArray (bitArraySetBits)"
            , ""
            , "getCombiningClass :: Char -> Int"
            , concat $ map genCombiningClassDef ccmap
            , "getCombiningClass _ = 0\n"
            , ""
            , "{-# INLINE isCombining #-}"
            , genSignature  "isCombining"
            , genRangeCheck "isCombining" ordList
            , "isCombining c = lookupBit combiningBitmap (ord c)"
            , genBitmap "combining" ordList
            ]
    where
        genCombiningClassDef (c, d) =
            "getCombiningClass " <> show c <> " = " <> show d <> "\n"

        ccmap = (filter (\(_,cc) -> cc /= 0)
                 . map (\(c,prop) -> (c, _combiningClass prop))) props

        ordList = map (ord . fst) ccmap

data DType = Canonical | Kompat

decompositions :: DType -> PropertiesDB -> [(Char, [Char])]
decompositions dtype =
      map    (\(c, prop) -> (c, decomposeChar c (_decomposition prop)))
    . filter (predicate   . _decompositionType . snd)
    . filter ((/= DCSelf) . _decomposition . snd)
    where predicate = case dtype of
              Canonical -> (== Just DTCanonical)
              Kompat    -> (const True)

genDecomposable :: DType -> PropertiesDB -> String -> String
genDecomposable dtype props file = unlines
            [ "-- autogenerated from Unicode data"
            , "module Data.Unicode.Properties." <> file
            , "(decomposeBitmap, decomposeMax, decomposeMin)"
            , "where"
            , ""
            , "import Data.BitArray (BitArray)"
            , "import Data.Unicode.Properties.BitArray (bitArraySetBits)"
            , ""
            , genMinMax "decompose" ordList
            , genBitmap "decompose" ordList
            ]
    where
        chrList = filter (not . isHangul)
                         (map fst (decompositions dtype props))
        ordList = map ord chrList

decomposeChar :: Char -> Decomp -> [Char]
decomposeChar c DCSelf   = [c]
decomposeChar _c (DC ds) = ds

genDecomposeModuleHdr :: String -> String
genDecomposeModuleHdr file = unlines
    [ "{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}"
    , "-- autogenerated from Unicode data"
    , "module Data.Unicode.Properties." <> file
    , "(decomposeChar)"
    , "where"
    ]

genDecomposeSign :: String
genDecomposeSign = unlines
    [ ""
    , "-- Note: this is a partial function we do not expect to call"
    , "-- this if isDecomposable returns false."
    , "{-# NOINLINE decomposeChar #-}"
    , "decomposeChar :: Char -> [Char]"
    ]

genDecomposeDefs :: DType -> PropertiesDB -> (Int -> Bool) -> String
genDecomposeDefs dtype props pred =
    concat $ map (genDecomposeDef "decomposeChar") decomps
    where
        decomps =
              filter (pred . ord . fst)
            . filter (not . isHangul . fst)
            $ (decompositions dtype props)
        genDecomposeDef name (c, d) =
            name <> " " <> show c <> " = " <> show d <> "\n"

genDecompositions :: PropertiesDB -> String -> String
genDecompositions props file = unlines
            [ genDecomposeModuleHdr file
            , genDecomposeSign
            , genDecomposeDefs Canonical props (const True)
            ]

-- Compatibility decompositions are split in two parts to keep the file sizes
-- short enough
genDecompositionsK :: PropertiesDB -> String -> String
genDecompositionsK props file = unlines
            [ genDecomposeModuleHdr file
            , "import qualified Data.Unicode.Properties.DecompositionsK2 as DK2"
            , genDecomposeSign
            , genDecomposeDefs Kompat props (< 60000)
            , "decomposeChar c = DK2.decomposeChar c"
            ]

genDecompositionsK2 :: PropertiesDB -> String -> String
genDecompositionsK2 props file = unlines
            [ genDecomposeModuleHdr file
            , genDecomposeSign
            , genDecomposeDefs Kompat props (>= 60000)
            ]

-------------------------------------------------------------------------------
-- Generate data structures for compositions
-------------------------------------------------------------------------------

genCompositions :: PropertiesDB -> String -> String
genCompositions props file = unlines
            [ "{-# OPTIONS_GHC -fno-warn-incomplete-patterns #-}"
            , "-- autogenerated from Unicode data"
            , "module Data.Unicode.Properties." <> file
            , "(composePair)"
            , "where"
            , ""
            , "{-# NOINLINE composePair #-}"
            , "composePair :: Char -> Char -> Maybe Char"
            , concat $ map (genComposePair "composePair") decomps
            , "composePair _ _ = " <> "Nothing" <> "\n"
            ]
    where
        genComposePair name (c, [d1, d2]) =
            name <> " " <> show d1 <> " " <> show d2 <> " = Just " <> show c <> "\n"
        genComposePair _ _ = error "Bug: decomp length is not 2"

        decomps =   filter ((flip S.notMember) exclusions . fst)
                  . filter (not . isHangul . fst)
                  . filter ((== 2) . length . snd)
                  $ (decompositions Canonical props)

        exclusions =  S.fromList
                    . map fst
                    . filter (_fullDecompositionExclusion . snd) $ props

-------------------------------------------------------------------------------
-- Create and read binary properties data
-------------------------------------------------------------------------------

readSavedProps :: FilePath -> IO [(Char, CharProps)]
readSavedProps file = Bin.decode <$> L.readFile file

writeBinary :: Binary a => FilePath -> a -> IO ()
writeBinary file props = do
  L.writeFile file (Bin.encode props)

type PropertiesDB = [(Char,CharProps)]

readQCValue :: String -> QCValue
readQCValue "Y" = QCYes
readQCValue "N" = QCNo
readQCValue "M" = QCMaybe
readQCValue x = error $ "Unknown QCValue: " ++ show x

readYN :: String -> Bool
readYN "Y" = True
readYN "N" = False
readYN x = error $ "Unknown YNValue: " ++ show x

readCodePoint :: String -> Char
readCodePoint = chr . read . ("0x"++)

readDecomp :: String -> Decomp
readDecomp "#" = DCSelf
readDecomp s   = DC . map readCodePoint $ words s

readDecompType :: String -> Maybe DecompType
readDecompType "none" = Nothing
readDecompType s      = Just (dtmap!s)
    where
        dtmap = M.fromList
            [
              ("can"       , DTCanonical)
            , ("com"       , DTCompat   )
            , ("enc"       , DTCircle   )
            , ("fin"       , DTFinal    )
            , ("font"      , DTFont     )
            , ("fra"       , DTFraction )
            , ("init"      , DTInitial  )
            , ("iso"       , DTIsolated )
            , ("med"       , DTMedial   )
            , ("nar"       , DTNarrow   )
            , ("nb"        , DTNoBreak  )
            , ("sml"       , DTSmall    )
            , ("sqr"       , DTSquare   )
            , ("sub"       , DTSub      )
            , ("sup"       , DTSuper    )
            , ("vert"      , DTVertical )
            , ("wide"      , DTWide     )
            ]

toProp :: Tag String -> PropertiesDB
toProp (TagOpen _ psml) = [ (c, CharProps{..}) | c <- cps ]
    where
        psm = M.fromList psml
        cps = let readCP = (fmap readCodePoint . (`M.lookup` psm))
              in case readCP <$> ["cp", "first-cp", "last-cp"] of
                [Just c , Nothing, Nothing] -> [c]
                [Nothing, Just c1, Just c2] -> [c1..c2]
                _                           -> undefined

        _name                       =                  psm!"na"
        _generalCategory            = read           $ psm!"gc"
        _nfd_qc                     = readYN         $ psm!"NFD_QC"
        _nfkd_qc                    = readYN         $ psm!"NFKD_QC"
        _nfc_qc                     = readQCValue    $ psm!"NFC_QC"
        _nfkc_qc                    = readQCValue    $ psm!"NFKC_QC"
        _upper                      = readYN         $ psm!"Upper"
        _otherUpper                 = readYN         $ psm!"OUpper"
        _lower                      = readYN         $ psm!"Lower"
        _otherLower                 = readYN         $ psm!"OLower"
        _combiningClass             = read           $ psm!"ccc"
        _dash                       = readYN         $ psm!"Dash"
        _hyphen                     = readYN         $ psm!"Hyphen"
        _quotationMark              = readYN         $ psm!"QMark"
        _terminalPunctuation        = readYN         $ psm!"Term"
        _diactric                   = readYN         $ psm!"Dia"
        _extender                   = readYN         $ psm!"Ext"
        _decomposition              = readDecomp     $ psm!"dm"
        _decompositionType          = readDecompType $ psm!"dt"
        _fullDecompositionExclusion = readYN         $ psm!"Comp_Ex"
toProp _ = undefined

-- | Extract char properties from UCD XML file
xmlToProps :: FilePath -> FilePath -> IO [(Char, CharProps)]
xmlToProps src dst = do
  input <- readFile src
  let props = concatMap toProp (filter isChar $ parseTags input)
              :: [(Char,CharProps)]
  props `deepseq` writeBinary dst props
  return props

  where isChar (TagOpen "char" _) = True
        isChar _                  = False

-- | Convert the unicode data file (ucd.all.flat.xml) to Haskell data
-- structures
processFile :: FilePath -> FilePath -> IO ()
processFile src outdir = do
    props <- (readSavedProps dst
              `catch` \(_e::IOException) -> xmlToProps src dst)
    -- print $ length props
    emitFile "Decomposable"    $ genDecomposable   Canonical props
    emitFile "DecomposableK"   $ genDecomposable   Kompat    props

    emitFile "Decompositions"   $ genDecompositions props
    emitFile "DecompositionsK"  $ genDecompositionsK props
    emitFile "DecompositionsK2" $ genDecompositionsK2 props

    emitFile "Compositions"    $ genCompositions   props
    emitFile "CombiningClass"  $ genCombiningClass props

    where
        -- properties db file
        dst = src -<.> ".pdb"
        emitFile name gen =
            writeFile (outdir <> "/" <> name <> ".hs") $ gen name

main :: IO ()
main = withCli processFile