File: NormalizationTest.hs

package info (click to toggle)
haskell-unicode-transforms 0.4.0.1-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 3,308 kB
  • sloc: haskell: 786; sh: 15; makefile: 3
file content (129 lines) | stat: -rw-r--r-- 4,318 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{-# LANGUAGE CPP               #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections     #-}

-- |
-- Copyright   : (c) 2016 Harendra Kumar
--
-- License     : BSD-3-Clause
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
-- Portability : GHC
--

import Control.Monad (when)
import qualified Data.ByteString as B
import Data.Char (chr, isSpace, ord, toUpper)
#if MIN_VERSION_base(4,8,0)
import Data.Function ((&))
#endif
import Data.List (intercalate, isPrefixOf)
import Data.List.Split (splitOn)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize)
import Text.Printf (printf)

#if !MIN_VERSION_base(4,8,0)
(&) :: a -> (a -> b) -> b
x & f = f x
#endif

chrToHex :: Char -> [Char]
chrToHex = (map toUpper) . (printf "%.4x") . ord

strToHex :: [Char] -> String
strToHex = unwords . (map chrToHex)

checkEqual :: String -> (Text -> Text) -> (Text, Text) -> IO Bool
checkEqual opName op (c1, c2) =
    if c1 /= op c2 then do
        putStrLn $ opName ++ " "            ++ txtToHex c2
                          ++ " = "          ++ txtToHex (op c2)
                          ++ "; Expected: " ++ txtToHex c1
        return False
    else return True
    where
        txtToHex = strToHex . T.unpack

checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
checkOp name op pairs = do
    res <- mapM (checkEqual name ((normalize op))) pairs
    return $ all (== True) res

checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFC (c1, c2, c3, c4, c5) =
    checkOp "toNFC" NFC $
            concat [ map (c2,) [c1, c2, c3]
                   , map (c4,) [c4, c5]
                   ]

checkNFD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFD (c1, c2, c3, c4, c5) =
    checkOp "toNFD" NFD $
            concat [ map (c3,) [c1, c2, c3]
                   , map (c5,) [c4, c5]
                   ]

checkNFKC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKC (c1, c2, c3, c4, c5) =
    checkOp "toNFKC" NFKC $ map (c4,) [c1, c2, c3, c4, c5]

checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKD (c1, c2, c3, c4, c5) =
    checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]

checkAllTestCases :: Int -> String -> IO ()
checkAllTestCases lineno line = do
    case splitOn ";" line of
        c1 : c2 : c3 : c4 : c5 : _ -> do
            let cps = map cpToText [c1, c2, c3, c4, c5]
            mapM_ (checkOneTestCase cps)
                  [checkNFD, checkNFKD, checkNFC, checkNFKC]
        _ -> error $ "Unrecognized line: " ++ line
    where
        cpToText xs = T.pack $ map (chr . read . ("0x" ++)) (words xs)

        checkOneTestCase cps f = do
            res <- f (tuplify cps)
            when (not res) $ do
                putStrLn ("Failed at line: " ++ show lineno)
                putStrLn line
                putStrLn $ codes ++ "; # (" ++ txt
                error "Bailing out"
            where
                strs  = map T.unpack cps
                codes = intercalate ";" $ map strToHex strs
                txt   = intercalate "; " (map T.unpack cps)

        tuplify [c1, c2, c3, c4, c5] = (c1, c2, c3, c4, c5)
        tuplify _ = error "tuplify bad arguments"

checkLine :: (Int, String) -> IO ()
checkLine (lineno, line) = do
    -- marker lines indicating a test block start with @
    if "@" `isPrefixOf` line
        then
            putStrLn line
        else
            checkAllTestCases lineno line

testNormalize :: FilePath -> IO ()
testNormalize file = do
    contents <- T.unpack . T.decodeUtf8 <$> B.readFile file
    let ls = lines contents                        -- split into lines
         & map (dropWhile isSpace)                 -- trim leading spaces
         & zip [1..]                               -- add numbering
         & filter ((/= []) . snd)                  -- remove blank lines
         & filter (not . ("#" `isPrefixOf`) . snd) -- remove comments
    checkAll ls
    where
        checkAll (x:xs) = checkLine x >> checkAll xs
        checkAll []     = return ()

main :: IO ()
main = do
    testNormalize "test/data/NormalizationTest.txt"
    -- Additional test cases not in the unicode standard suite
    testNormalize "test/data/extra/NormalizationTest.txt"