1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
|
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
-- |
-- Copyright : (c) 2016 Harendra Kumar
--
-- License : BSD-3-Clause
-- Maintainer : harendra.kumar@gmail.com
-- Stability : experimental
-- Portability : GHC
--
import Control.Monad (when)
import qualified Data.ByteString as B
import Data.Char (chr, isSpace, ord, toUpper)
#if MIN_VERSION_base(4,8,0)
import Data.Function ((&))
#endif
import Data.List (intercalate, isPrefixOf)
import Data.List.Split (splitOn)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Text.Normalize (NormalizationMode(NFD, NFKD, NFC, NFKC), normalize)
import Text.Printf (printf)
#if !MIN_VERSION_base(4,8,0)
(&) :: a -> (a -> b) -> b
x & f = f x
#endif
chrToHex :: Char -> [Char]
chrToHex = (map toUpper) . (printf "%.4x") . ord
strToHex :: [Char] -> String
strToHex = unwords . (map chrToHex)
checkEqual :: String -> (Text -> Text) -> (Text, Text) -> IO Bool
checkEqual opName op (c1, c2) =
if c1 /= op c2 then do
putStrLn $ opName ++ " " ++ txtToHex c2
++ " = " ++ txtToHex (op c2)
++ "; Expected: " ++ txtToHex c1
return False
else return True
where
txtToHex = strToHex . T.unpack
checkOp :: String -> NormalizationMode -> [(Text, Text)] -> IO Bool
checkOp name op pairs = do
res <- mapM (checkEqual name ((normalize op))) pairs
return $ all (== True) res
checkNFC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFC (c1, c2, c3, c4, c5) =
checkOp "toNFC" NFC $
concat [ map (c2,) [c1, c2, c3]
, map (c4,) [c4, c5]
]
checkNFD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFD (c1, c2, c3, c4, c5) =
checkOp "toNFD" NFD $
concat [ map (c3,) [c1, c2, c3]
, map (c5,) [c4, c5]
]
checkNFKC :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKC (c1, c2, c3, c4, c5) =
checkOp "toNFKC" NFKC $ map (c4,) [c1, c2, c3, c4, c5]
checkNFKD :: (Text, Text, Text, Text, Text) -> IO Bool
checkNFKD (c1, c2, c3, c4, c5) =
checkOp "toNFKD" NFKD $ map (c5,) [c1, c2, c3, c4, c5]
checkAllTestCases :: Int -> String -> IO ()
checkAllTestCases lineno line = do
case splitOn ";" line of
c1 : c2 : c3 : c4 : c5 : _ -> do
let cps = map cpToText [c1, c2, c3, c4, c5]
mapM_ (checkOneTestCase cps)
[checkNFD, checkNFKD, checkNFC, checkNFKC]
_ -> error $ "Unrecognized line: " ++ line
where
cpToText xs = T.pack $ map (chr . read . ("0x" ++)) (words xs)
checkOneTestCase cps f = do
res <- f (tuplify cps)
when (not res) $ do
putStrLn ("Failed at line: " ++ show lineno)
putStrLn line
putStrLn $ codes ++ "; # (" ++ txt
error "Bailing out"
where
strs = map T.unpack cps
codes = intercalate ";" $ map strToHex strs
txt = intercalate "; " (map T.unpack cps)
tuplify [c1, c2, c3, c4, c5] = (c1, c2, c3, c4, c5)
tuplify _ = error "tuplify bad arguments"
checkLine :: (Int, String) -> IO ()
checkLine (lineno, line) = do
-- marker lines indicating a test block start with @
if "@" `isPrefixOf` line
then
putStrLn line
else
checkAllTestCases lineno line
testNormalize :: FilePath -> IO ()
testNormalize file = do
contents <- T.unpack . T.decodeUtf8 <$> B.readFile file
let ls = lines contents -- split into lines
& map (dropWhile isSpace) -- trim leading spaces
& zip [1..] -- add numbering
& filter ((/= []) . snd) -- remove blank lines
& filter (not . ("#" `isPrefixOf`) . snd) -- remove comments
checkAll ls
where
checkAll (x:xs) = checkLine x >> checkAll xs
checkAll [] = return ()
main :: IO ()
main = do
testNormalize "test/data/NormalizationTest.txt"
-- Additional test cases not in the unicode standard suite
testNormalize "test/data/extra/NormalizationTest.txt"
|