File: unit.hs

package info (click to toggle)
haskell-unicode-collation 0.1.3.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 10,004 kB
  • sloc: haskell: 1,566; makefile: 3
file content (229 lines) | stat: -rw-r--r-- 9,615 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Main (main) where
import Text.Collate
import Text.Collate.Normalize (toNFD)
import Text.Printf
import Test.Tasty
import Test.Tasty.QuickCheck
import Test.Tasty.HUnit
import Data.Either (lefts)
import Data.List (sortBy)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Read as TR
import qualified Data.Text.Normalize as N
import Data.Char
import Data.Maybe
import qualified Data.ByteString.Char8 as B8

main :: IO ()
main = do
  conformanceTree <- conformanceTests
  defaultMain (tests conformanceTree)

tests :: TestTree -> TestTree
tests conformanceTree = testGroup "Tests"
  [ conformanceTree
  , testCase "Sorting test 1" $
    sortBy (collate ourCollator) ["hi", "hit", "hít", "hat", "hot",
                       "naïve", "nag", "name"] @?=
           ["hat","hi","hit","h\237t","hot","nag","naïve","name"]
  , testCase "Sorting test 2" $
    sortBy (collate ourCollator)
                      ["カ", "ヵ", "abc", "abç", "ab\xFFFE\&c", "ab©",
                       "𝒶bc", "abC", "𝕒bc", "File-3", "ガ", "が", "äbc", "カ",
                       "か", "Abc", "file-12", "filé-110"]
                      @?=
                       ["ab©", "abc", "abC", "𝒶bc", "𝕒bc", "Abc",
                       "abç", "äbc", "ab\xFFFE\&c",
                       "filé-110", "file-12", "File-3",
                       "か", "ヵ", "カ", "カ", "が", "ガ"]

  , testGroup "Variable ordering test"
     $ map variableOrderingCase
      [ (NonIgnorable,
        ["de luge" ,"de Luge" ,"de-luge" ,"de-Luge" ,"de\x2010luge"
        ,"de\x2010Luge" ,"death" ,"deluge" ,"deLuge" ,"demark"])
      , (Blanked,
        ["death" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deluge"
        ,"de Luge" ,"de-Luge","de\x2010Luge", "deLuge", "demark"])
      , (Shifted,
        ["death" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deluge"
        ,"de Luge" ,"de-Luge" ,"de\x2010Luge" ,"deLuge" ,"demark"])
      , (ShiftTrimmed,
        ["death" ,"deluge" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deLuge"
        ,"de Luge" ,"de-Luge" ,"de\x2010Luge" ,"demark"])
      ]
  , testGroup "Localized collations"
    [ testCase "root cha cza" $
        collate "und" "cha" "cza" @?= LT
    , testCase "es traditional cha cza" $
        collate "es-u-co-trad" "cha" "cza" @?= GT
    , testCase "se ö z" $
        collate "se" "ö" "z" @?= GT
    , testCase "tr ö z" $
        collate "tr" "ö" "z" @?= LT
    , testCase "fr-CA sorted list" $
        sortBy (collate (collatorFor "fr-CA-u-kb-true"))
        ["déjà","Meme", "deja", "même", "dejà", "bpef", "bœg", "Boef", "Mémé",
         "bœf", "boef", "bnef", "pêche", "pèché", "pêché", "pêche", "pêché"]
         @?=
        ["bnef", "boef", "Boef", "bœf", "bœg", "bpef", "deja", "dejà", "déjà",
         "Meme", "même", "Mémé", "pêche", "pêche", "pèché", "pêché", "pêché"]
    , testCase "fr with French accents" $
        collate "fr-u-kb-true" "coté" "côte" @?= GT
    , testCase "fr without French accents" $
        collate "fr-u-kb-false" "coté" "côte" @?= LT
    , testCase "fr kb defaults to true" $
        collate "fr-u-kb" "coté" "côte" @?= GT
    , testCase "fr without kb defaults to false" $
        collate "fr" "coté" "côte" @?= LT
    , testCase "en with shifted" $
        collate "en-u-ka-shifted" "de-luge" "de Luge" @?= LT
    , testCase "en with nonignorable" $
        collate "en-u-ka-noignore" "de-luge" "de Luge" @?= GT
    , testCase "de-u-co-phonebk" $
        sortBy (collate "de-u-co-phonebk")
        ["Übelacker", "Üxküll", "Uell", "Ülle", "Udet", "Uffenbach", "Ueve"]
        @?=
        ["Udet", "Übelacker", "Uell", "Ülle", "Ueve", "Üxküll", "Uffenbach"]
    , testCase "zh-u-co-pinyin" $
        collate "zh-u-co-pinyin" "\x963F" "\x5475"  @?= LT
    , testCase "zh-u-co-stroke" $
        collate "zh-u-co-stroke" "\x963F" "\x5475"  @?= GT
    ]
  , testCase "QuasiQuotes" $
       collate [collator|zh-u-co-pinyin|] "\x963F" "\x5475" @?= LT
  , testGroup "BCP 47 Lang parsing"
       (map langParseTest langPairs)
  , testGroup "BCP 47 Lang round-trip"
       (map langRoundTripTest langPairs)
  , testGroup "Lang fallback behavior"
    [ testCase "de => ROOT" $
      (optLang . collatorOptions) "de" @?= Nothing
    , testCase "de-AT => ROOT" $
        (optLang . collatorOptions) "dt-AT" @?= Nothing
    , testCase "es-u-co-trad" $
        (optLang . collatorOptions) "es-ES" @?=
          Just (Lang "es" Nothing Nothing [] [] [])
    , testCase "de-DE-u-co-phonebk" $
        (optLang . collatorOptions) "de-DE-u-co-phonebk" @?=
          Just (Lang "de" Nothing Nothing [] [("u",[("co","phonebk")])] [])
    , testCase "es-u-co-nonexist-kb" $
        (optLang . collatorOptions) "es-u-co-nonexist-kb" @?=
          Just (Lang "es" Nothing Nothing [] [] [])
    ]
  , testGroup "Normalization"
    [ testProperty "toNFD agrees with unicode-transforms"
         (\cs -> toNFD (map ord cs) ==
           (map ord . T.unpack . N.normalize N.NFD . T.pack) cs)
    ]
  ]

emptyLang :: Lang
emptyLang = Lang mempty mempty mempty mempty mempty mempty

langPairs :: [(Text, Lang)]
langPairs = [ ("en", emptyLang{langLanguage = "en"})
            , ("en with garbage following",
                     emptyLang{langLanguage = "en"})
            , ("en--with garbage following", -- CSL allows this
                     emptyLang{langLanguage = "en"})
            , ("en-US", emptyLang{langLanguage = "en", langRegion = Just "US"})
            , ("sr_Latn_RS", emptyLang{langLanguage = "sr",
                                   langScript = Just "Latn",
                                   langRegion = Just "RS"})
            , ("es-419", emptyLang{langLanguage = "es",
                                   langRegion = Just "419"})
            , ("de-CH-1996", emptyLang{langLanguage = "de",
                                       langRegion = Just "CH",
                                       langVariants = ["1996"]})
            , ("en-u-kr-latin-digit", emptyLang{langLanguage = "en",
                     langExtensions = [("u", [("kr", "latin-digit")])]})
            ]

langParseTest :: (Text, Lang) -> TestTree
langParseTest (t, l) =
  testCase (T.unpack t) $ parseLang t @?= Right l

langRoundTripTest :: (Text, Lang) -> TestTree
langRoundTripTest (_,l) =
  let l' = renderLang l
   in testCase (T.unpack l') $ renderLang <$> parseLang l' @?= Right l'

conformanceTests :: IO TestTree
conformanceTests = do
  putStrLn "Loading conformance test data..."
  shifted <- conformanceTestsFor Shifted
              "test/uca-collation-test/CollationTest_SHIFTED_SHORT.txt"
  nonIgnorable <- conformanceTestsFor NonIgnorable
              "test/uca-collation-test/CollationTest_NON_IGNORABLE_SHORT.txt"
  return $ testGroup "Conformance tests" [nonIgnorable, shifted]

conformanceTestsFor :: VariableWeighting -> FilePath -> IO TestTree
conformanceTestsFor weighting fp = do
  xs <- parseConformanceTest fp
  let coll = setVariableWeighting weighting rootCollator
  return $ testCase ("Conformance tests " ++ show weighting ++ " " ++ fp)
         $ (\zs -> case lefts zs of
                     [] -> return ()
                     es -> assertFailure (unlines es))
         $ map (conformanceTestWith coll)
              (zip3 (map fst xs) (map snd xs) (drop 1 (map snd xs)))

conformanceTestWith :: Collator -> (Int, Text, Text) -> Either String ()
conformanceTestWith coll (lineNo, txt1, txt2) =
  let showHexes = unwords . map ((\c -> if c > 0xFFFF
                                           then printf "%05X" c
                                           else printf "%04X" c) . ord)
                          . T.unpack
   in if collate coll txt1 txt2 /= GT
         then Right ()
         else Left $ "[line " ++ show lineNo ++ "] " ++
                      showHexes txt1 ++ " <= " ++ showHexes txt2 ++ "\n" ++
                      "  Calculated sort keys:\n  [" ++
                        showHexes txt1 ++ "] " ++
                        renderSortKey (sortKey coll txt1) ++ "\n  [" ++
                        showHexes txt2 ++ "] " ++
                        renderSortKey (sortKey coll txt2)

variableOrderingCase :: (VariableWeighting , [Text]) -> TestTree
variableOrderingCase (w , expected) =
  testCase (show w) $
     sortBy (collate (setVariableWeighting w rootCollator))
           -- from Table 12
           [ "de luge"
           , "de Luge"
           , "de-luge"
           , "de-Luge"
           , "de\x2010luge"
           , "de\x2010Luge"
           , "death"
           , "deluge"
           , "deLuge"
           , "demark" ]
           @?= expected

ourCollator :: Collator
ourCollator = setVariableWeighting Shifted rootCollator

parseConformanceTest :: FilePath -> IO [(Int, Text)]
parseConformanceTest fp = do
  bs <- B8.readFile fp
  let beginsWithHexDigit = maybe False (isHexDigit . fst) . B8.uncons
  let allLines = B8.lines bs
  let prologue = takeWhile (not . beginsWithHexDigit) allLines
  let lns = drop (length prologue) allLines
  let firstLine = 1 + length prologue
  return $ catMaybes $ zipWith parseConformanceTestLine [firstLine..] lns

parseConformanceTestLine :: Int -> B8.ByteString -> Maybe (Int, Text)
parseConformanceTestLine lineno bs =
  let readhex = either error fst . TR.hexadecimal
      codepoints = map (readhex . TE.decodeLatin1) $ B8.words bs
      t = T.pack (map chr codepoints)
   in if T.any (== '\xFFFD') t || T.null t
         then Nothing
         else Just (lineno, t)