File: unit.hs

package info (click to toggle)
haskell-unicode-collation 0.1.3.6-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 10,004 kB
sloc: haskell: 1,566; makefile: 3
file content (229 lines) | stat: -rw-r--r-- 9,615 bytes
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Main (main) where
import Text.Collate
import Text.Collate.Normalize (toNFD)
import Text.Printf
import Test.Tasty
import Test.Tasty.QuickCheck
import Test.Tasty.HUnit
import Data.Either (lefts)
import Data.List (sortBy)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Read as TR
import qualified Data.Text.Normalize as N
import Data.Char
import Data.Maybe
import qualified Data.ByteString.Char8 as B8

main :: IO ()
main = do
  conformanceTree <- conformanceTests
  defaultMain (tests conformanceTree)

tests :: TestTree -> TestTree
tests conformanceTree = testGroup "Tests"
  [ conformanceTree
  , testCase "Sorting test 1" $
    sortBy (collate ourCollator) ["hi", "hit", "hít", "hat", "hot",
                       "naïve", "nag", "name"] @?=
           ["hat","hi","hit","h\237t","hot","nag","naïve","name"]
  , testCase "Sorting test 2" $
    sortBy (collate ourCollator)
                      ["ｶ", "ヵ", "abc", "abç", "ab\xFFFE\&c", "ab©",
                       "𝒶bc", "abC", "𝕒bc", "File-3", "ガ", "が", "äbc", "カ",
                       "か", "Abc", "file-12", "filé-110"]
                      @?=
                       ["ab©", "abc", "abC", "𝒶bc", "𝕒bc", "Abc",
                       "abç", "äbc", "ab\xFFFE\&c",
                       "filé-110", "file-12", "File-3",
                       "か", "ヵ", "カ", "ｶ", "が", "ガ"]

  , testGroup "Variable ordering test"
     $ map variableOrderingCase
      [ (NonIgnorable,
        ["de luge" ,"de Luge" ,"de-luge" ,"de-Luge" ,"de\x2010luge"
        ,"de\x2010Luge" ,"death" ,"deluge" ,"deLuge" ,"demark"])
      , (Blanked,
        ["death" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deluge"
        ,"de Luge" ,"de-Luge","de\x2010Luge", "deLuge", "demark"])
      , (Shifted,
        ["death" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deluge"
        ,"de Luge" ,"de-Luge" ,"de\x2010Luge" ,"deLuge" ,"demark"])
      , (ShiftTrimmed,
        ["death" ,"deluge" ,"de luge" ,"de-luge" ,"de\x2010luge" ,"deLuge"
        ,"de Luge" ,"de-Luge" ,"de\x2010Luge" ,"demark"])
      ]
  , testGroup "Localized collations"
    [ testCase "root cha cza" $
        collate "und" "cha" "cza" @?= LT
    , testCase "es traditional cha cza" $
        collate "es-u-co-trad" "cha" "cza" @?= GT
    , testCase "se ö z" $
        collate "se" "ö" "z" @?= GT
    , testCase "tr ö z" $
        collate "tr" "ö" "z" @?= LT
    , testCase "fr-CA sorted list" $
        sortBy (collate (collatorFor "fr-CA-u-kb-true"))
        ["déjà","Meme", "deja", "même", "dejà", "bpef", "bœg", "Boef", "Mémé",
         "bœf", "boef", "bnef", "pêche", "pèché", "pêché", "pêche", "pêché"]
         @?=
        ["bnef", "boef", "Boef", "bœf", "bœg", "bpef", "deja", "dejà", "déjà",
         "Meme", "même", "Mémé", "pêche", "pêche", "pèché", "pêché", "pêché"]
    , testCase "fr with French accents" $
        collate "fr-u-kb-true" "coté" "côte" @?= GT
    , testCase "fr without French accents" $
        collate "fr-u-kb-false" "coté" "côte" @?= LT
    , testCase "fr kb defaults to true" $
        collate "fr-u-kb" "coté" "côte" @?= GT
    , testCase "fr without kb defaults to false" $
        collate "fr" "coté" "côte" @?= LT
    , testCase "en with shifted" $
        collate "en-u-ka-shifted" "de-luge" "de Luge" @?= LT
    , testCase "en with nonignorable" $
        collate "en-u-ka-noignore" "de-luge" "de Luge" @?= GT
    , testCase "de-u-co-phonebk" $
        sortBy (collate "de-u-co-phonebk")
        ["Übelacker", "Üxküll", "Uell", "Ülle", "Udet", "Uffenbach", "Ueve"]
        @?=
        ["Udet", "Übelacker", "Uell", "Ülle", "Ueve", "Üxküll", "Uffenbach"]
    , testCase "zh-u-co-pinyin" $
        collate "zh-u-co-pinyin" "\x963F" "\x5475"  @?= LT
    , testCase "zh-u-co-stroke" $
        collate "zh-u-co-stroke" "\x963F" "\x5475"  @?= GT
    ]
  , testCase "QuasiQuotes" $
       collate [collator|zh-u-co-pinyin|] "\x963F" "\x5475" @?= LT
  , testGroup "BCP 47 Lang parsing"
       (map langParseTest langPairs)
  , testGroup "BCP 47 Lang round-trip"
       (map langRoundTripTest langPairs)
  , testGroup "Lang fallback behavior"
    [ testCase "de => ROOT" $
      (optLang . collatorOptions) "de" @?= Nothing
    , testCase "de-AT => ROOT" $
        (optLang . collatorOptions) "dt-AT" @?= Nothing
    , testCase "es-u-co-trad" $
        (optLang . collatorOptions) "es-ES" @?=
          Just (Lang "es" Nothing Nothing [] [] [])
    , testCase "de-DE-u-co-phonebk" $
        (optLang . collatorOptions) "de-DE-u-co-phonebk" @?=
          Just (Lang "de" Nothing Nothing [] [("u",[("co","phonebk")])] [])
    , testCase "es-u-co-nonexist-kb" $
        (optLang . collatorOptions) "es-u-co-nonexist-kb" @?=
          Just (Lang "es" Nothing Nothing [] [] [])
    ]
  , testGroup "Normalization"
    [ testProperty "toNFD agrees with unicode-transforms"
         (\cs -> toNFD (map ord cs) ==
           (map ord . T.unpack . N.normalize N.NFD . T.pack) cs)
    ]
  ]

emptyLang :: Lang
emptyLang = Lang mempty mempty mempty mempty mempty mempty

langPairs :: [(Text, Lang)]
langPairs = [ ("en", emptyLang{langLanguage = "en"})
            , ("en with garbage following",
                     emptyLang{langLanguage = "en"})
            , ("en--with garbage following", -- CSL allows this
                     emptyLang{langLanguage = "en"})
            , ("en-US", emptyLang{langLanguage = "en", langRegion = Just "US"})
            , ("sr_Latn_RS", emptyLang{langLanguage = "sr",
                                   langScript = Just "Latn",
                                   langRegion = Just "RS"})
            , ("es-419", emptyLang{langLanguage = "es",
                                   langRegion = Just "419"})
            , ("de-CH-1996", emptyLang{langLanguage = "de",
                                       langRegion = Just "CH",
                                       langVariants = ["1996"]})
            , ("en-u-kr-latin-digit", emptyLang{langLanguage = "en",
                     langExtensions = [("u", [("kr", "latin-digit")])]})
            ]

langParseTest :: (Text, Lang) -> TestTree
langParseTest (t, l) =
  testCase (T.unpack t) $ parseLang t @?= Right l

langRoundTripTest :: (Text, Lang) -> TestTree
langRoundTripTest (_,l) =
  let l' = renderLang l
   in testCase (T.unpack l') $ renderLang <$> parseLang l' @?= Right l'

conformanceTests :: IO TestTree
conformanceTests = do
  putStrLn "Loading conformance test data..."
  shifted <- conformanceTestsFor Shifted
              "test/uca-collation-test/CollationTest_SHIFTED_SHORT.txt"
  nonIgnorable <- conformanceTestsFor NonIgnorable
              "test/uca-collation-test/CollationTest_NON_IGNORABLE_SHORT.txt"
  return $ testGroup "Conformance tests" [nonIgnorable, shifted]

conformanceTestsFor :: VariableWeighting -> FilePath -> IO TestTree
conformanceTestsFor weighting fp = do
  xs <- parseConformanceTest fp
  let coll = setVariableWeighting weighting rootCollator
  return $ testCase ("Conformance tests " ++ show weighting ++ " " ++ fp)
         $ (\zs -> case lefts zs of
                     [] -> return ()
                     es -> assertFailure (unlines es))
         $ map (conformanceTestWith coll)
              (zip3 (map fst xs) (map snd xs) (drop 1 (map snd xs)))

conformanceTestWith :: Collator -> (Int, Text, Text) -> Either String ()
conformanceTestWith coll (lineNo, txt1, txt2) =
  let showHexes = unwords . map ((\c -> if c > 0xFFFF
                                           then printf "%05X" c
                                           else printf "%04X" c) . ord)
                          . T.unpack
   in if collate coll txt1 txt2 /= GT
         then Right ()
         else Left $ "[line " ++ show lineNo ++ "] " ++
                      showHexes txt1 ++ " <= " ++ showHexes txt2 ++ "\n" ++
                      "  Calculated sort keys:\n  [" ++
                        showHexes txt1 ++ "] " ++
                        renderSortKey (sortKey coll txt1) ++ "\n  [" ++
                        showHexes txt2 ++ "] " ++
                        renderSortKey (sortKey coll txt2)

variableOrderingCase :: (VariableWeighting , [Text]) -> TestTree
variableOrderingCase (w , expected) =
  testCase (show w) $
     sortBy (collate (setVariableWeighting w rootCollator))
           -- from Table 12
           [ "de luge"
           , "de Luge"
           , "de-luge"
           , "de-Luge"
           , "de\x2010luge"
           , "de\x2010Luge"
           , "death"
           , "deluge"
           , "deLuge"
           , "demark" ]
           @?= expected

ourCollator :: Collator
ourCollator = setVariableWeighting Shifted rootCollator

parseConformanceTest :: FilePath -> IO [(Int, Text)]
parseConformanceTest fp = do
  bs <- B8.readFile fp
  let beginsWithHexDigit = maybe False (isHexDigit . fst) . B8.uncons
  let allLines = B8.lines bs
  let prologue = takeWhile (not . beginsWithHexDigit) allLines
  let lns = drop (length prologue) allLines
  let firstLine = 1 + length prologue
  return $ catMaybes $ zipWith parseConformanceTestLine [firstLine..] lns

parseConformanceTestLine :: Int -> B8.ByteString -> Maybe (Int, Text)
parseConformanceTestLine lineno bs =
  let readhex = either error fst . TR.hexadecimal
      codepoints = map (readhex . TE.decodeLatin1) $ B8.words bs
      t = T.pack (map chr codepoints)
   in if T.any (== '\xFFFD') t || T.null t
         then Nothing
         else Just (lineno, t)