1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
|
{-# LANGUAGE OverloadedStrings #-}
module Main (main) where
import Text.Collate
import Data.Char (chr, ord)
import Data.Text (Text)
import qualified Data.Text.IO as T
import qualified Data.Text as T
import qualified Data.Text.Read as TR
import Data.List (sortBy, find)
import System.Environment (getArgs)
import Control.Monad
import System.Exit
import Text.Printf
import System.IO
import Text.Collate.Normalize (toNFD)
main :: IO ()
main = do
args <- getArgs
let isHelp "-h" = True
isHelp "--help" = True
isHelp _ = False
when (any isHelp args) $ do
putStrLn "Usage: unicode-collate [COLLATION]"
putStrLn "Options:"
putStrLn " --hex Parse input as hex code points"
putStrLn " --help Print usage information"
putStrLn " --list List supported collations"
putStrLn " --verbose Include diagnostic information"
putStrLn ""
putStrLn "Sorts lines from stdin using the specified collation."
putStrLn "COLLATION is a BCP47 language code. Examples:"
putStrLn "unicode-collate # Use root collation"
putStrLn "unicode-collate es # Use standard Spanish collation"
putStrLn "unicode-collate es-u-co-trad # Use traditional Spanish"
putStrLn "unicode-collate fr-CA # Use Canadian French collation"
putStrLn "unicode-collate fr-u-kb # Use reverse accent order"
putStrLn "unicode-collate fr-u-ka-shifted # Use Shifted variable weighting"
exitSuccess
when ("--list" `elem` args) $ do
mapM_ (T.putStrLn . renderLang . fst) tailorings
exitSuccess
let isOpt ('-':_) = True
isOpt _ = False
spec <- maybe "root" T.pack . find (not . isOpt) <$> getArgs
lang <- either handleError return $ parseLang spec
let myCollator = collatorFor lang
let verbose = "--verbose" `elem` args
let codepoints = "--hex" `elem` args
let opts = collatorOptions myCollator
when verbose $ do
hPutStrLn stderr "Options:"
hPutStrLn stderr $ " Tailoring: " <>
maybe "ROOT" (T.unpack . renderLang) (optLang opts)
hPutStrLn stderr $ " Variable weighting: " <>
show (optVariableWeighting opts)
hPutStrLn stderr $ " French accents: " <>
show (optFrenchAccents opts)
hPutStrLn stderr $ " Upper before lower: " <>
show (optUpperBeforeLower opts)
hPutStrLn stderr $ " Normalize: " <>
show (optNormalize opts)
let renderLine t = do
t' <- if codepoints
then parseAsCodePoints t
else return t
when verbose $
hPutStrLn stderr $ renderCodePoints
(toNFD $ map ord $ T.unpack t') ++ "; # ("
++ T.unpack t' ++ ") " ++ renderSortKey (sortKey myCollator t')
T.putStrLn t'
T.getContents >>= mapM_ renderLine . sortBy (collate myCollator) . T.lines
renderCodePoints :: [Int] -> String
renderCodePoints = unwords . map (printf "%04X")
parseAsCodePoints :: Text -> IO Text
parseAsCodePoints t = do
let ws = T.words $ T.takeWhile (/=';') t -- everything after ; is ignored
cs <- mapM parseCodePoint ws
return $ T.pack $ map chr cs
parseCodePoint :: Text -> IO Int
parseCodePoint t =
case TR.hexadecimal t of
Right (x,t') | T.null t' -> return x
_ -> handleError $ "Could not parse " <> show t <> " as hex code point."
handleError :: String -> IO a
handleError msg = do
hPutStrLn stderr msg
exitWith $ ExitFailure 1
|